kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ale...@apache.org
Subject kudu git commit: consensus: support changing between VOTER and NON_VOTER
Date Tue, 24 Oct 2017 20:06:35 GMT
Repository: kudu
Updated Branches:
  refs/heads/master e938aaa21 -> 28a671365


consensus: support changing between VOTER and NON_VOTER

This patch implements changing from VOTER to NON_VOTER and vice-versa.

Added an integration test scenario for NON_VOTER --> VOTER,
VOTER --> NON_VOTER changes and other use cases.

Change-Id: I978cbed683b0e95a903f2bd2b57496aee71cb33a
Reviewed-on: http://gerrit.cloudera.org:8080/8297
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <todd@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/28a67136
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/28a67136
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/28a67136

Branch: refs/heads/master
Commit: 28a671365d6d38da966481daf937b3776e3d4852
Parents: e938aaa
Author: Mike Percy <mpercy@apache.org>
Authored: Wed Oct 18 01:12:26 2017 +0900
Committer: Todd Lipcon <todd@apache.org>
Committed: Tue Oct 24 18:16:36 2017 +0000

----------------------------------------------------------------------
 src/kudu/consensus/quorum_util-test.cc          |  10 +
 src/kudu/consensus/quorum_util.cc               |   6 +
 src/kudu/consensus/quorum_util.h                |   4 +
 src/kudu/consensus/raft_consensus.cc            |  61 ++++-
 src/kudu/consensus/raft_consensus.h             |   8 +
 .../integration-tests/cluster_itest_util.cc     |  35 ++-
 src/kudu/integration-tests/cluster_itest_util.h |  10 +
 .../raft_consensus_nonvoter-itest.cc            | 225 ++++++++++++++++++-
 8 files changed, 345 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/28a67136/src/kudu/consensus/quorum_util-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/quorum_util-test.cc b/src/kudu/consensus/quorum_util-test.cc
index 4b2ad0c..8864977 100644
--- a/src/kudu/consensus/quorum_util-test.cc
+++ b/src/kudu/consensus/quorum_util-test.cc
@@ -179,6 +179,16 @@ TEST(QuorumUtilTest, TestIsRaftConfigVoter) {
   ASSERT_FALSE(IsRaftConfigVoter("B", config));
   ASSERT_FALSE(IsRaftConfigVoter("C", config));
   ASSERT_FALSE(IsRaftConfigVoter(no_member_type_peer_uuid, config));
+
+  RaftPeerPB* peer_a;
+  ASSERT_OK(GetRaftConfigMember(&config, "A", &peer_a));
+  RaftPeerPB* peer_b;
+  ASSERT_OK(GetRaftConfigMember(&config, "B", &peer_b));
+  ASSERT_FALSE(ReplicaTypesEqual(*peer_a, *peer_b));
+  ASSERT_TRUE(ReplicaTypesEqual(*peer_b, *peer_b));
+  RaftPeerPB* peer_c;
+  ASSERT_OK(GetRaftConfigMember(&config, "C", &peer_c));
+  ASSERT_FALSE(ReplicaTypesEqual(*peer_b, *peer_c));
 }
 
 } // namespace consensus

http://git-wip-us.apache.org/repos/asf/kudu/blob/28a67136/src/kudu/consensus/quorum_util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/quorum_util.cc b/src/kudu/consensus/quorum_util.cc
index f81f2a7..90d9671 100644
--- a/src/kudu/consensus/quorum_util.cc
+++ b/src/kudu/consensus/quorum_util.cc
@@ -98,6 +98,12 @@ bool RemoveFromRaftConfig(RaftConfigPB* config, const string& uuid)
{
   return true;
 }
 
+bool ReplicaTypesEqual(const RaftPeerPB& peer1, const RaftPeerPB& peer2) {
+  // TODO(mpercy): Include comparison of replica intentions once they are
+  // implemented.
+  return peer1.member_type() == peer2.member_type();
+}
+
 int CountVoters(const RaftConfigPB& config) {
   int voters = 0;
   for (const RaftPeerPB& peer : config.peers()) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/28a67136/src/kudu/consensus/quorum_util.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/quorum_util.h b/src/kudu/consensus/quorum_util.h
index 26b6e78..3bd0afc 100644
--- a/src/kudu/consensus/quorum_util.h
+++ b/src/kudu/consensus/quorum_util.h
@@ -56,6 +56,10 @@ Status GetRaftConfigLeader(ConsensusStatePB* cstate, RaftPeerPB** peer_pb);
 // Returns true on success.
 bool RemoveFromRaftConfig(RaftConfigPB* config, const std::string& uuid);
 
+// Returns true iff the two peers have equivalent replica types and associated
+// options.
+bool ReplicaTypesEqual(const RaftPeerPB& peer1, const RaftPeerPB& peer2);
+
 // Counts the number of voters in the configuration.
 int CountVoters(const RaftConfigPB& config);
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/28a67136/src/kudu/consensus/raft_consensus.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc
index 581bace..5f7f392 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -316,7 +316,7 @@ Status RaftConsensus::Start(const ConsensusBootstrapInfo& info,
       }
     }
 
-    // Now assume "follower" duties.
+    // Now assume non-leader replica duties.
     RETURN_NOT_OK(BecomeReplicaUnlocked(fd_initial_delta));
 
     SetStateUnlocked(kRunning);
@@ -585,15 +585,9 @@ Status RaftConsensus::BecomeReplicaUnlocked(boost::optional<MonoDelta>
fd_delta)
                                  << ToStringUnlocked();
   ClearLeaderUnlocked();
 
-  if (consensus::IsVoterRole(cmeta_->active_role())) {
-    // A voter should run failure detector, if not a leader.
-    EnableFailureDetector(std::move(fd_delta));
-  } else {
-    // A non-voter should not start leader elections. The leader failure
-    // detector should be re-enabled once the non-voter replica is promoted
-    // to voter replica.
-    DisableFailureDetector();
-  }
+  // Enable/disable leader failure detection if becoming VOTER/NON_VOTER replica
+  // correspondingly.
+  ToggleFailureDetector(std::move(fd_delta));
 
   // Now that we're a replica, we can allow voting for other nodes.
   withhold_votes_until_ = MonoTime::Min();
@@ -1648,8 +1642,31 @@ Status RaftConsensus::ChangeConfig(const ChangeConfigRequestPB&
req,
 
       // TODO(mpercy): Support changing between VOTER and NON_VOTER.
       case CHANGE_REPLICA_TYPE:
+        if (server.member_type() == RaftPeerPB::UNKNOWN_MEMBER_TYPE) {
+          return Status::InvalidArgument("Cannot change replica type to UNKNOWN_MEMBER_TYPE");
+        }
+        if (server_uuid == peer_uuid()) {
+          return Status::InvalidArgument(
+              Substitute("Cannot change the replica type of peer $0 because it is the leader.
"
+                         "Force another leader to be elected to modify the type of this replica.
"
+                         "Consensus state: $1",
+                         server_uuid,
+                         SecureShortDebugString(cmeta_->ToConsensusStatePB())));
+        }
+        RaftPeerPB* peer_pb;
+        RETURN_NOT_OK(GetRaftConfigMember(&new_config, server_uuid, &peer_pb));
+        // Validate the changes.
+        if (ReplicaTypesEqual(*peer_pb, server)) {
+          return Status::InvalidArgument("Cannot change replica type to same type");
+        }
+        peer_pb->set_member_type(server.member_type());
+        // TODO(mpercy): Copy over replica intentions when implemented.
+        break;
+
       default:
-        return Status::NotSupported("Role change is not yet implemented.");
+        return Status::NotSupported(Substitute(
+            "$0: unsupported type of configuration change",
+            ChangeConfigType_Name(type)));
     }
 
     RETURN_NOT_OK(ReplicateConfigChangeUnlocked(
@@ -2401,6 +2418,10 @@ void RaftConsensus::CompleteConfigChangeRoundUnlocked(ConsensusRound*
round, con
       LOG_WITH_PREFIX_UNLOCKED(INFO) << "Aborting config change with OpId "
                                      << op_id << ": " << status.ToString();
       cmeta_->clear_pending_config();
+
+      // Disable leader failure detection if transitioning from VOTER to
+      // NON_VOTER and vice versa.
+      ToggleFailureDetector();
     } else {
       LOG_WITH_PREFIX_UNLOCKED(INFO)
           << "Skipping abort of non-pending config change with OpId "
@@ -2448,7 +2469,6 @@ void RaftConsensus::CompleteConfigChangeRoundUnlocked(ConsensusRound*
round, con
   }
 }
 
-
 void RaftConsensus::EnableFailureDetector(boost::optional<MonoDelta> delta) {
   if (PREDICT_TRUE(FLAGS_enable_leader_failure_detection)) {
     failure_detector_->Start(std::move(delta));
@@ -2461,6 +2481,18 @@ void RaftConsensus::DisableFailureDetector() {
   }
 }
 
+void RaftConsensus::ToggleFailureDetector(boost::optional<MonoDelta> delta) {
+  if (IsRaftConfigVoter(peer_uuid(), cmeta_->ActiveConfig())) {
+    // A non-leader voter replica should run failure detector.
+    EnableFailureDetector(std::move(delta));
+  } else {
+    // A non-voter should not start leader elections. The leader failure
+    // detector should be re-enabled once the non-voter replica is promoted
+    // to voter replica.
+    DisableFailureDetector();
+  }
+}
+
 void RaftConsensus::SnoozeFailureDetector(boost::optional<string> reason_for_log,
                                           boost::optional<MonoDelta> delta) {
   if (PREDICT_TRUE(failure_detector_ && FLAGS_enable_leader_failure_detection)) {
@@ -2583,6 +2615,11 @@ Status RaftConsensus::SetPendingConfigUnlocked(const RaftConfigPB&
new_config) {
         << "New pending config: " << SecureShortDebugString(new_config);
   }
   cmeta_->set_pending_config(new_config);
+
+  // Disable leader failure detection if transitioning from VOTER to NON_VOTER
+  // and vice versa.
+  ToggleFailureDetector();
+
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/28a67136/src/kudu/consensus/raft_consensus.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus.h b/src/kudu/consensus/raft_consensus.h
index 5ab93b8..7229bb7 100644
--- a/src/kudu/consensus/raft_consensus.h
+++ b/src/kudu/consensus/raft_consensus.h
@@ -563,6 +563,14 @@ class RaftConsensus : public std::enable_shared_from_this<RaftConsensus>,
   // If the failure detector is already disabled, has no effect.
   void DisableFailureDetector();
 
+  // Disable leader failure detector if transitioning from VOTER to NON_VOTER,
+  // and vice versa. The decision is based on the type of membership of the
+  // peer in the active Raft configuration.
+  //
+  // If it's time to enable the leader failure detection, the specified
+  // 'delta' value is used as described in EnableFailureDetector()'s comment.
+  void ToggleFailureDetector(boost::optional<MonoDelta> delta = boost::none);
+
   // "Reset" the failure detector to indicate leader activity.
   //
   // When this is called a failure is guaranteed not to be detected before

http://git-wip-us.apache.org/repos/asf/kudu/blob/28a67136/src/kudu/integration-tests/cluster_itest_util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/cluster_itest_util.cc b/src/kudu/integration-tests/cluster_itest_util.cc
index e956804..cac682d 100644
--- a/src/kudu/integration-tests/cluster_itest_util.cc
+++ b/src/kudu/integration-tests/cluster_itest_util.cc
@@ -738,7 +738,40 @@ Status RemoveServer(const TServerDetails* leader,
   rpc.set_timeout(timeout);
   RETURN_NOT_OK(leader->consensus_proxy->ChangeConfig(req, &resp, &rpc));
   if (resp.has_error()) {
-    if (error_code) *error_code = resp.error().code();
+    if (error_code) {
+      *error_code = resp.error().code();
+    }
+    return StatusFromPB(resp.error().status());
+  }
+  return Status::OK();
+}
+
+Status ChangeReplicaType(const TServerDetails* leader,
+                         const std::string& tablet_id,
+                         const TServerDetails* target_replica,
+                         RaftPeerPB::MemberType replica_type,
+                         const MonoDelta& timeout,
+                         const boost::optional<int64_t>& cas_config_index,
+                         tserver::TabletServerErrorPB::Code* error_code) {
+  ChangeConfigRequestPB req;
+  req.set_dest_uuid(leader->uuid());
+  req.set_tablet_id(tablet_id);
+  req.set_type(consensus::CHANGE_REPLICA_TYPE);
+  if (cas_config_index) {
+    req.set_cas_config_opid_index(*cas_config_index);
+  }
+  RaftPeerPB* peer = req.mutable_server();
+  peer->set_permanent_uuid(target_replica->uuid());
+  peer->set_member_type(replica_type);
+
+  ChangeConfigResponsePB resp;
+  RpcController rpc;
+  rpc.set_timeout(timeout);
+  RETURN_NOT_OK(leader->consensus_proxy->ChangeConfig(req, &resp, &rpc));
+  if (resp.has_error()) {
+    if (error_code) {
+      *error_code = resp.error().code();
+    }
     return StatusFromPB(resp.error().status());
   }
   return Status::OK();

http://git-wip-us.apache.org/repos/asf/kudu/blob/28a67136/src/kudu/integration-tests/cluster_itest_util.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/cluster_itest_util.h b/src/kudu/integration-tests/cluster_itest_util.h
index 63c28c7..9252dc9 100644
--- a/src/kudu/integration-tests/cluster_itest_util.h
+++ b/src/kudu/integration-tests/cluster_itest_util.h
@@ -288,6 +288,16 @@ Status RemoveServer(const TServerDetails* leader,
                     const boost::optional<int64_t>& cas_config_index = boost::none,
                     tserver::TabletServerErrorPB::Code* error_code = nullptr);
 
+// Change type of the given replica to the specified type.
+// The RPC request is sent to 'leader'.
+Status ChangeReplicaType(const TServerDetails* leader,
+                         const std::string& tablet_id,
+                         const TServerDetails* target_replica,
+                         consensus::RaftPeerPB::MemberType replica_type,
+                         const MonoDelta& timeout,
+                         const boost::optional<int64_t>& cas_config_index = boost::none,
+                         tserver::TabletServerErrorPB::Code* error_code = nullptr);
+
 // Get the list of tablets from the remote server.
 Status ListTablets(const TServerDetails* ts,
                    const MonoDelta& timeout,

http://git-wip-us.apache.org/repos/asf/kudu/blob/28a67136/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc b/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
index b4287d7..6b68914 100644
--- a/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
@@ -16,12 +16,14 @@
 // under the License.
 
 #include <cstdint>
+#include <ostream>
 #include <string>
 #include <unordered_map>
 #include <utility>
 #include <vector>
 
 #include <gflags/gflags_declare.h>
+#include <glog/logging.h>
 #include <gtest/gtest.h>
 
 #include "kudu/consensus/consensus.pb.h"
@@ -97,6 +99,14 @@ class RaftConsensusNonVoterITest : public RaftConsensusITestBase {
   Status RemoveReplica(const string& tablet_id,
                        const TServerDetails* replica,
                        const MonoDelta& timeout);
+
+  // Change replica membership to the specified type, i.e. promote a replica
+  // in case of RaftPeerPB::VOTER member_type, and demote a replica in case of
+  // RaftPeerPB::NON_VOTER member_type.
+  Status ChangeReplicaMembership(RaftPeerPB::MemberType member_type,
+                                 const string& tablet_id,
+                                 const TServerDetails* replica,
+                                 const MonoDelta& timeout);
 };
 
 Status RaftConsensusNonVoterITest::GetTabletCopySourceSessionsCount(
@@ -142,6 +152,22 @@ Status RaftConsensusNonVoterITest::RemoveReplica(const string& tablet_id,
   return RemoveServer(leader, tablet_id, replica, timeout);
 }
 
+Status RaftConsensusNonVoterITest::ChangeReplicaMembership(
+    RaftPeerPB::MemberType member_type,
+    const string& tablet_id,
+    const TServerDetails* replica,
+    const MonoDelta& timeout) {
+  TServerDetails* leader = nullptr;
+  RETURN_NOT_OK(GetLeaderReplicaWithRetries(tablet_id, &leader));
+
+  // Wait for at least one operation committed by the leader in current term.
+  // Otherwise, any Raft configuration change attempt might end up with error
+  // 'Illegal state: Leader has not yet committed an operation in its own term'.
+  RETURN_NOT_OK(WaitForOpFromCurrentTerm(leader, tablet_id,
+                                         consensus::COMMITTED_OPID, timeout));
+  return ChangeReplicaType(leader, tablet_id, replica, member_type, timeout);
+}
+
 // Ensure that adding a NON_VOTER replica is properly handled by the system:
 //
 //   * Updating Raft configuration for tablet by adding a NON_VOTER replica
@@ -378,7 +404,7 @@ TEST_F(RaftConsensusNonVoterITest, AddThenRemoveNonVoterReplica) {
       new_replica_idx, tablet_id, { TABLET_DATA_TOMBSTONED }, kTimeout));
 
   // The added and then removed tablet replica should be gone, and the master
-  // should report approrpiate replica count at this point. The tablet leader
+  // should report appropriate replica count at this point. The tablet leader
   // should be established.
   bool has_leader;
   master::TabletLocationsPB tablet_locations;
@@ -409,6 +435,11 @@ TEST_F(RaftConsensusNonVoterITest, AddThenRemoveNonVoterReplica) {
 //  * does not start leader elections
 //  * returns an error on RunLeaderElection() RPC call
 TEST_F(RaftConsensusNonVoterITest, NonVoterReplicasDoNotVote) {
+  if (!AllowSlowTests()) {
+    LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
+    return;
+  }
+
   const MonoDelta kTimeout = MonoDelta::FromSeconds(60);
   const int kOriginalReplicasNum = 2;
   const int kHbIntervalMs = 64;
@@ -529,5 +560,197 @@ TEST_F(RaftConsensusNonVoterITest, NonVoterReplicasDoNotVote) {
   }
 }
 
+// Test that it's possible to promote a NON_VOTER replica into a VOTER one
+// and demote a VOTER replica into NON_VOTER (if not a leader). The new VOTER
+// replica should be able to participate in elections, and the quorum should
+// change accordingly. Promote and demote a replica under active workload.
+// Promote a replica and remove it, making sure it gets tombstoned.
+TEST_F(RaftConsensusNonVoterITest, PromoteAndDemote) {
+  if (!AllowSlowTests()) {
+    LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
+    return;
+  }
+
+  const MonoDelta kTimeout = MonoDelta::FromSeconds(60);
+  const int kInitialReplicasNum = 3;
+  const int kHbIntervalMs = 50;
+  const vector<string> kTserverFlags = {
+    Substitute("--raft_heartbeat_interval_ms=$0", kHbIntervalMs),
+  };
+
+  FLAGS_num_tablet_servers = 4;
+  FLAGS_num_replicas = kInitialReplicasNum;
+  NO_FATALS(BuildAndStart(kTserverFlags));
+  ASSERT_EQ(4, tablet_servers_.size());
+  ASSERT_EQ(kInitialReplicasNum, tablet_replicas_.size());
+
+  const string& tablet_id = tablet_id_;
+  TabletServerMap replica_servers;
+  for (const auto& e : tablet_replicas_) {
+    if (e.first == tablet_id) {
+      replica_servers.emplace(e.second->uuid(), e.second);
+    }
+  }
+  ASSERT_EQ(FLAGS_num_replicas, replica_servers.size());
+
+  ASSERT_OK(WaitForServersToAgree(kTimeout, replica_servers, tablet_id, 1));
+
+  TServerDetails* new_replica = nullptr;
+  for (const auto& ts : tablet_servers_) {
+    if (replica_servers.find(ts.first) == replica_servers.end()) {
+      new_replica = ts.second;
+      break;
+    }
+  }
+  ASSERT_NE(nullptr, new_replica);
+
+  ASSERT_OK(AddReplica(tablet_id, new_replica, RaftPeerPB::NON_VOTER, kTimeout));
+
+  {
+    // It should not be possible to demote the tablet leader.
+    TServerDetails* leader;
+    ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id, &leader));
+    const Status s_demote = ChangeReplicaMembership(RaftPeerPB::NON_VOTER,
+                                                    tablet_id, leader, kTimeout);
+    const auto s_demote_str = s_demote.ToString();
+    ASSERT_TRUE(s_demote.IsInvalidArgument()) << s_demote_str;
+    ASSERT_STR_MATCHES(s_demote_str,
+        "Cannot change the replica type of peer .* because it is the leader");
+
+    // It should not be possible to promote a leader replica since it's
+    // already a voter.
+    const Status s_promote = ChangeReplicaMembership(RaftPeerPB::VOTER,
+                                                     tablet_id, leader, kTimeout);
+    const auto s_promote_str = s_promote.ToString();
+    ASSERT_TRUE(s_promote.IsInvalidArgument()) << s_promote_str;
+    ASSERT_STR_MATCHES(s_promote_str,
+        "Cannot change the replica type of peer .* because it is the leader");
+  }
+
+  {
+    // It should not be possible to demote a non-voter.
+    Status s = ChangeReplicaMembership(RaftPeerPB::NON_VOTER,
+                                       tablet_id, new_replica, kTimeout);
+    const auto s_str = s.ToString();
+    ASSERT_TRUE(s.IsInvalidArgument()) << s_str;
+    ASSERT_STR_CONTAINS(s_str, "Cannot change replica type to same type");
+  }
+
+  // It should be possible to promote a non-voter to voter.
+  ASSERT_OK(ChangeReplicaMembership(RaftPeerPB::VOTER, tablet_id,
+                                    new_replica, kTimeout));
+
+  {
+    // It should not be possible to promote a voter since it's a voter already.
+    Status s = ChangeReplicaMembership(RaftPeerPB::VOTER,
+                                       tablet_id, new_replica, kTimeout);
+    const auto s_str = s.ToString();
+    ASSERT_TRUE(s.IsInvalidArgument()) << s_str;
+    ASSERT_STR_CONTAINS(s_str, "Cannot change replica type to same type");
+  }
+
+  // Make sure the promoted replica is in the quorum.
+  {
+    // Pause both the current leader and the new voter, and make sure
+    // no new leader can be elected since the majority is now 3 out of 4 voters.
+    TServerDetails* leader;
+    ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id, &leader));
+    ExternalTabletServer* leader_ts =
+        cluster_->tablet_server_by_uuid(leader->uuid());
+    ASSERT_OK(leader_ts->Pause());
+    auto cleanup_leader = MakeScopedCleanup([&]() {
+      ASSERT_OK(leader_ts->Resume());
+    });
+
+    ExternalTabletServer* new_replica_ts =
+        cluster_->tablet_server_by_uuid(new_replica->uuid());
+    ASSERT_OK(new_replica_ts->Pause());
+    auto cleanup_new_replica = MakeScopedCleanup([&]() {
+      ASSERT_OK(new_replica_ts->Resume());
+    });
+
+    {
+      TServerDetails* leader;
+      Status s = GetLeaderReplicaWithRetries(tablet_id, &leader, 10);
+      const auto s_str = s.ToString();
+      ASSERT_TRUE(s.IsNotFound()) << s_str;
+      ASSERT_STR_CONTAINS(s_str, "leader replica not found");
+    }
+
+    // Resume the newly promoted replica: there should be 3 out of 4 voters
+    // available now, so they should be able to elect a leader among them.
+    ASSERT_OK(new_replica_ts->Resume());
+    ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id, &leader));
+
+    // It should be possible to demote a replica back: 2 out of 3 voters
+    // of the result configuration are available, so 2 active voters out of 3
+    // (1 is still paused) and 1 non-votes in the tablet constitute a functional
+    // tablet, so they should be able to elect a leader replica among them.
+    ASSERT_OK(ChangeReplicaMembership(RaftPeerPB::NON_VOTER,
+                                      tablet_id, new_replica, kTimeout));
+    ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id, &leader));
+  }
+
+  // Promote/demote a replica under active workload.
+  {
+    TestWorkload workload(cluster_.get());
+    workload.set_table_name(kTableId);
+    workload.Setup();
+    workload.Start();
+    while (workload.rows_inserted() < 10) {
+      SleepFor(MonoDelta::FromMilliseconds(10));
+    }
+
+    ASSERT_OK(ChangeReplicaMembership(RaftPeerPB::VOTER,
+                                      tablet_id, new_replica, kTimeout));
+    auto rows_inserted = workload.rows_inserted();
+    while (workload.rows_inserted() < rows_inserted * 2) {
+      SleepFor(MonoDelta::FromMilliseconds(10));
+    }
+
+    ASSERT_OK(ChangeReplicaMembership(RaftPeerPB::NON_VOTER,
+                                      tablet_id, new_replica, kTimeout));
+    rows_inserted = workload.rows_inserted();
+    while (workload.rows_inserted() < rows_inserted * 2) {
+      SleepFor(MonoDelta::FromMilliseconds(10));
+    }
+    workload.StopAndJoin();
+
+    ASSERT_OK(WaitForServersToAgree(kTimeout, replica_servers, tablet_id, 1));
+
+    ClusterVerifier v(cluster_.get());
+    NO_FATALS(v.CheckCluster());
+    NO_FATALS(v.CheckRowCount(workload.table_name(),
+                              ClusterVerifier::EXACTLY,
+                              workload.rows_inserted()));
+  }
+
+  // Promote the non-voter replica again and then remove it when it's a voter,
+  // making sure it gets tombstoned.
+  {
+    ASSERT_OK(ChangeReplicaMembership(RaftPeerPB::VOTER,
+                                      tablet_id, new_replica, kTimeout));
+    ASSERT_OK(WaitForServersToAgree(kTimeout, replica_servers, tablet_id, 1));
+    ASSERT_OK(RemoveReplica(tablet_id, new_replica, kTimeout));
+
+    // Verify the removed replica gets tombstoned.
+    const int new_replica_idx =
+        cluster_->tablet_server_index_by_uuid(new_replica->uuid());
+    ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(
+        new_replica_idx, tablet_id, { TABLET_DATA_TOMBSTONED }, kTimeout));
+
+    // The removed tablet replica should be gone, and the master should report
+    // appropriate replica count at this point.
+    bool has_leader;
+    master::TabletLocationsPB tablet_locations;
+    ASSERT_OK(WaitForReplicasReportedToMaster(
+        cluster_->master_proxy(), kInitialReplicasNum, tablet_id, kTimeout,
+        WAIT_FOR_LEADER, &has_leader, &tablet_locations));
+    ASSERT_TRUE(has_leader);
+  }
+
+  NO_FATALS(cluster_->AssertNoCrashes());
+}
+
 }  // namespace tserver
 }  // namespace kudu


Mime
View raw message