kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jdcry...@apache.org
Subject [2/2] kudu git commit: KUDU-2101 Include a table summary at the bottom
Date Mon, 21 Aug 2017 17:25:20 GMT
KUDU-2101 Include a table summary at the bottom

This adds a table summary to the bottom of ksck's output. For each table,
the summary contains a status and information about the total number of
tablets and the number of healthy, under-replicated, and unavailable
tablets. A tablet with a consensus mismatch is counted as unavailable, to
make the summary easier for less experienced users to understand.

See ksck-test for sample output.

Change-Id: I1913352e3a1f91b4bb07e2f5001c8cc94d5155d4
Reviewed-on: http://gerrit.cloudera.org:8080/7707
Reviewed-by: Todd Lipcon <todd@apache.org>
Tested-by: Kudu Jenkins
Reviewed-by: Jean-Daniel Cryans <jdcryans@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/05542524
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/05542524
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/05542524

Branch: refs/heads/master
Commit: 055425245997a3b9a7615f14530164fad5a7ebb6
Parents: b4cb9ea
Author: Will Berkeley <wdberkeley@apache.org>
Authored: Thu Aug 17 12:08:47 2017 -0700
Committer: Jean-Daniel Cryans <jdcryans@apache.org>
Committed: Mon Aug 21 17:24:25 2017 +0000

----------------------------------------------------------------------
 src/kudu/tools/ksck-test.cc | 56 ++++++++++++++++++++++----
 src/kudu/tools/ksck.cc      | 86 +++++++++++++++++++++++++++++++---------
 src/kudu/tools/ksck.h       | 40 ++++++++++++++++---
 3 files changed, 150 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/05542524/src/kudu/tools/ksck-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck-test.cc b/src/kudu/tools/ksck-test.cc
index 4b02639..987cd8c 100644
--- a/src/kudu/tools/ksck-test.cc
+++ b/src/kudu/tools/ksck-test.cc
@@ -302,7 +302,7 @@ TEST_F(KsckTest, TestBadTabletServer) {
   ASSERT_TRUE(s.IsNetworkError()) << "Status returned: " << s.ToString();
 
   s = ksck_->CheckTablesConsistency();
-  EXPECT_EQ("Corruption: 1 table(s) are bad", s.ToString());
+  EXPECT_EQ("Corruption: 1 out of 1 table(s) are bad", s.ToString());
   ASSERT_STR_CONTAINS(
       err_stream_.str(),
       "WARNING: Unable to connect to Tablet Server "
@@ -381,6 +381,11 @@ TEST_F(KsckTest, TestOneSmallReplicatedTableWithConsensusState) {
   FLAGS_consensus = true;
   CreateOneSmallReplicatedTable();
   ASSERT_OK(RunKsck());
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+      "Table Summary\n"
+      " Name | Status  | Total Tablets | Healthy | Under-replicated | Unavailable\n"
+      "------+---------+---------------+---------+------------------+-------------\n"
+      " test | HEALTHY | 3             | 3       | 0                | 0");
 }
 
 TEST_F(KsckTest, TestConsensusConflictExtraPeer) {
@@ -393,7 +398,7 @@ TEST_F(KsckTest, TestConsensusConflictExtraPeer) {
   cstate.mutable_committed_config()->add_peers()->set_permanent_uuid("ts-id-fake");
 
   Status s = RunKsck();
-  ASSERT_EQ("Corruption: 1 table(s) are bad", s.ToString());
+  ASSERT_EQ("Corruption: 1 out of 1 table(s) are bad", s.ToString());
   ASSERT_STR_CONTAINS(err_stream_.str(),
       "The consensus matrix is:\n"
       " Config source |      Voters      | Current term | Config index | Committed?\n"
@@ -402,6 +407,11 @@ TEST_F(KsckTest, TestConsensusConflictExtraPeer) {
       " A             | A*  B   C   D    | 0            |              | Yes\n"
       " B             | A*  B   C        | 0            |              | Yes\n"
       " C             | A*  B   C        | 0            |              | Yes");
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+      "Table Summary\n"
+      " Name |   Status    | Total Tablets | Healthy | Under-replicated | Unavailable\n"
+      "------+-------------+---------------+---------+------------------+-------------\n"
+      " test | UNAVAILABLE | 3             | 2       | 0                | 1");
 }
 
 TEST_F(KsckTest, TestConsensusConflictMissingPeer) {
@@ -414,7 +424,7 @@ TEST_F(KsckTest, TestConsensusConflictMissingPeer) {
   cstate.mutable_committed_config()->mutable_peers()->RemoveLast();
 
   Status s = RunKsck();
-  ASSERT_EQ("Corruption: 1 table(s) are bad", s.ToString());
+  ASSERT_EQ("Corruption: 1 out of 1 table(s) are bad", s.ToString());
   ASSERT_STR_CONTAINS(err_stream_.str(),
       "The consensus matrix is:\n"
       " Config source |    Voters    | Current term | Config index | Committed?\n"
@@ -423,6 +433,11 @@ TEST_F(KsckTest, TestConsensusConflictMissingPeer) {
       " A             | A*  B        | 0            |              | Yes\n"
       " B             | A*  B   C    | 0            |              | Yes\n"
       " C             | A*  B   C    | 0            |              | Yes");
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+      "Table Summary\n"
+      " Name |   Status    | Total Tablets | Healthy | Under-replicated | Unavailable\n"
+      "------+-------------+---------------+---------+------------------+-------------\n"
+      " test | UNAVAILABLE | 3             | 2       | 0                | 1");
 }
 
 TEST_F(KsckTest, TestConsensusConflictDifferentLeader) {
@@ -435,7 +450,7 @@ TEST_F(KsckTest, TestConsensusConflictDifferentLeader) {
   cstate.set_leader_uuid("ts-id-1");
 
   Status s = RunKsck();
-  ASSERT_EQ("Corruption: 1 table(s) are bad", s.ToString());
+  ASSERT_EQ("Corruption: 1 out of 1 table(s) are bad", s.ToString());
   ASSERT_STR_CONTAINS(err_stream_.str(),
       "The consensus matrix is:\n"
       " Config source |    Voters    | Current term | Config index | Committed?\n"
@@ -444,15 +459,25 @@ TEST_F(KsckTest, TestConsensusConflictDifferentLeader) {
       " A             | A   B*  C    | 0            |              | Yes\n"
       " B             | A*  B   C    | 0            |              | Yes\n"
       " C             | A*  B   C    | 0            |              | Yes");
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+      "Table Summary\n"
+      " Name |   Status    | Total Tablets | Healthy | Under-replicated | Unavailable\n"
+      "------+-------------+---------------+---------+------------------+-------------\n"
+      " test | UNAVAILABLE | 3             | 2       | 0                | 1");
 }
 
 TEST_F(KsckTest, TestOneOneTabletBrokenTable) {
   CreateOneOneTabletReplicatedBrokenTable();
   Status s = RunKsck();
-  EXPECT_EQ("Corruption: 1 table(s) are bad", s.ToString());
+  EXPECT_EQ("Corruption: 1 out of 1 table(s) are bad", s.ToString());
   ASSERT_STR_CONTAINS(err_stream_.str(),
                       "Tablet tablet-id-1 of table 'test' is under-replicated: "
                       "configuration has 2 replicas vs desired 3");
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+      "Table Summary\n"
+      " Name |      Status      | Total Tablets | Healthy | Under-replicated | Unavailable\n"
+      "------+------------------+---------------+---------+------------------+-------------\n"
+      " test | UNDER-REPLICATED | 1             | 0       | 1                | 0");
 }
 
 TEST_F(KsckTest, TestMismatchedAssignments) {
@@ -462,20 +487,25 @@ TEST_F(KsckTest, TestMismatchedAssignments) {
   ASSERT_EQ(1, ts->tablet_status_map_.erase("tablet-id-2"));
 
   Status s = RunKsck();
-  EXPECT_EQ("Corruption: 1 table(s) are bad", s.ToString());
+  EXPECT_EQ("Corruption: 1 out of 1 table(s) are bad", s.ToString());
   ASSERT_STR_CONTAINS(err_stream_.str(),
                       "Tablet tablet-id-2 of table 'test' is under-replicated: "
                       "1 replica(s) not RUNNING\n"
                       "  ts-id-0 (<mock>): missing [LEADER]\n"
                       "  ts-id-1 (<mock>): RUNNING\n"
                       "  ts-id-2 (<mock>): RUNNING\n");
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+      "Table Summary\n"
+      " Name |      Status      | Total Tablets | Healthy | Under-replicated | Unavailable\n"
+      "------+------------------+---------------+---------+------------------+-------------\n"
+      " test | UNDER-REPLICATED | 3             | 2       | 1                | 0");
 }
 
 TEST_F(KsckTest, TestTabletNotRunning) {
   CreateOneSmallReplicatedTableWithTabletNotRunning();
 
   Status s = RunKsck();
-  EXPECT_EQ("Corruption: 1 table(s) are bad", s.ToString());
+  EXPECT_EQ("Corruption: 1 out of 1 table(s) are bad", s.ToString());
   ASSERT_STR_CONTAINS(
       err_stream_.str(),
       "Tablet tablet-id-0 of table 'test' is unavailable: 3 replica(s) not RUNNING\n"
@@ -491,6 +521,11 @@ TEST_F(KsckTest, TestTabletNotRunning) {
       "    State:       FAILED\n"
       "    Data state:  TABLET_DATA_UNKNOWN\n"
       "    Last status: \n");
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+      "Table Summary\n"
+      " Name |   Status    | Total Tablets | Healthy | Under-replicated | Unavailable\n"
+      "------+-------------+---------------+---------+------------------+-------------\n"
+      " test | UNAVAILABLE | 3             | 2       | 0                | 1");
 }
 
 // Test for a bug where we weren't properly handling a tserver not reported by the master.
@@ -502,8 +537,13 @@ TEST_F(KsckTest, TestMissingTserver) {
   // tablets from other tablet servers are listing the missing tablet server as a peer.
   EraseKeyReturnValuePtr(&master_->tablet_servers_, "ts-id-0");
   Status s = RunKsck();
-  ASSERT_EQ("Corruption: 1 table(s) are bad", s.ToString());
+  ASSERT_EQ("Corruption: 1 out of 1 table(s) are bad", s.ToString());
   ASSERT_STR_CONTAINS(err_stream_.str(), "Table test has 3 under-replicated tablet(s)");
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+      "Table Summary\n"
+      " Name |      Status      | Total Tablets | Healthy | Under-replicated | Unavailable\n"
+      "------+------------------+---------------+---------+------------------+-------------\n"
+      " test | UNDER-REPLICATED | 3             | 0       | 3                | 0");
 }
 
 } // namespace tools

http://git-wip-us.apache.org/repos/asf/kudu/blob/05542524/src/kudu/tools/ksck.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc
index f3376cd..6b2f8b3 100644
--- a/src/kudu/tools/ksck.cc
+++ b/src/kudu/tools/ksck.cc
@@ -69,6 +69,7 @@ using std::setw;
 using std::shared_ptr;
 using std::string;
 using std::stringstream;
+using std::to_string;
 using std::unordered_map;
 using std::vector;
 using strings::Substitute;
@@ -206,32 +207,62 @@ Status Ksck::ConnectToTabletServer(const shared_ptr<KsckTabletServer>& ts) {
 }
 
 Status Ksck::CheckTablesConsistency() {
-  int tables_checked = 0;
   int bad_tables_count = 0;
+  vector<TableSummary> table_summaries;
   for (const shared_ptr<KsckTable> &table : cluster_->tables()) {
     if (!MatchesAnyPattern(table_filters_, table->name())) {
       VLOG(1) << "Skipping table " << table->name();
       continue;
     }
-    tables_checked++;
-    if (!VerifyTable(table)) {
+    TableSummary ts;
+    ts.name = table->name();
+    if (!VerifyTable(table, &ts)) {
       bad_tables_count++;
     }
+    table_summaries.emplace_back(std::move(ts));
     Out() << endl;
   }
 
-  if (tables_checked == 0) {
+  if (table_summaries.empty()) {
     Out() << "The cluster doesn't have any matching tables" << endl;
     return Status::OK();
   }
 
+  // Show unhealthy tables at the bottom so they're easier to see;
+  // otherwise sort alphabetically.
+  std::sort(table_summaries.begin(), table_summaries.end(),
+            [](const TableSummary& left, const TableSummary& right) {
+              return std::make_pair(left.TableStatus() != CheckResult::OK, left.name) <
+                     std::make_pair(right.TableStatus() != CheckResult::OK, right.name);
+            });
+  Out() << "Table Summary" << endl;
+  DataTable table({ "Name", "Status", "Total Tablets",
+                    "Healthy", "Under-replicated", "Unavailable"});
+  for (const TableSummary& ts : table_summaries) {
+    string status;
+    switch (ts.TableStatus()) {
+      case CheckResult::OK:
+        status = "HEALTHY";
+        break;
+      case CheckResult::UNDER_REPLICATED:
+        status = "UNDER-REPLICATED";
+        break;
+      default:
+        status = "UNAVAILABLE";
+        break;
+    }
+    table.AddRow({ ts.name, status, to_string(ts.TotalTablets()),
+                   to_string(ts.healthy_tablets), to_string(ts.underreplicated_tablets),
+                   to_string(ts.consensus_mismatch_tablets + ts.unavailable_tablets) });
+  }
+  CHECK_OK(table.PrintTo(Out()));
+
   if (bad_tables_count == 0) {
-    Out() << Substitute("The metadata for $0 table(s) is HEALTHY", tables_checked) << endl;
+    Out() << Substitute("The metadata for $0 table(s) is HEALTHY", table_summaries.size()) << endl;
     return Status::OK();
   }
-  Warn() << Substitute("$0 out of $1 table(s) are not in a healthy state",
-                       bad_tables_count, tables_checked) << endl;
-  return Status::Corruption(Substitute("$0 table(s) are bad", bad_tables_count));
+  return Status::Corruption(Substitute("$0 out of $1 table(s) are bad",
+                                       bad_tables_count, table_summaries.size()));
 }
 
 // Class to act as a collector of scan results.
@@ -541,7 +572,7 @@ Status Ksck::ChecksumData(const ChecksumOptions& opts) {
   return Status::OK();
 }
 
-bool Ksck::VerifyTable(const shared_ptr<KsckTable>& table) {
+bool Ksck::VerifyTable(const shared_ptr<KsckTable>& table, TableSummary* ts) {
   const auto all_tablets = table->tablets();
   vector<shared_ptr<KsckTablet>> tablets;
   std::copy_if(all_tablets.begin(), all_tablets.end(), std::back_inserter(tablets),
@@ -552,30 +583,47 @@ bool Ksck::VerifyTable(const shared_ptr<KsckTable>& table) {
   int table_num_replicas = table->num_replicas();
   VLOG(1) << Substitute("Verifying $0 tablet(s) for table $1 configured with num_replicas = $2",
                         tablets.size(), table->name(), table_num_replicas);
-
-  map<CheckResult, int> result_counts;
   for (const auto& tablet : tablets) {
     auto tablet_result = VerifyTablet(tablet, table_num_replicas);
-    result_counts[tablet_result]++;
+    switch (tablet_result) {
+      case CheckResult::OK:
+        ts->healthy_tablets++;
+        break;
+      case CheckResult::UNDER_REPLICATED:
+        ts->underreplicated_tablets++;
+        break;
+      case CheckResult::CONSENSUS_MISMATCH:
+        ts->consensus_mismatch_tablets++;
+        break;
+      case CheckResult::UNAVAILABLE:
+        ts->unavailable_tablets++;
+        break;
+    }
   }
-  if (result_counts[CheckResult::OK] == tablets.size()) {
+  if (ts->healthy_tablets == tablets.size()) {
     Out() << Substitute("Table $0 is $1 ($2 tablet(s) checked)",
                         table->name(),
                         Color(AnsiCode::GREEN, "HEALTHY"),
                         tablets.size()) << endl;
     return true;
   }
-  if (result_counts[CheckResult::UNAVAILABLE] > 0) {
+  if (ts->underreplicated_tablets > 0) {
     Out() << Substitute("Table $0 has $1 $2 tablet(s)",
                         table->name(),
-                        result_counts[CheckResult::UNAVAILABLE],
-                        Color(AnsiCode::RED, "unavailable")) << endl;
+                        ts->underreplicated_tablets,
+                        Color(AnsiCode::YELLOW, "under-replicated")) << endl;
   }
-  if (result_counts[CheckResult::UNDER_REPLICATED] > 0) {
+  if (ts->consensus_mismatch_tablets > 0) {
+    Out() << Substitute("Table $0 has $1 tablet(s) $2",
+                        table->name(),
+                        ts->consensus_mismatch_tablets,
+                        Color(AnsiCode::YELLOW, "with mismatched consensus")) << endl;
+  }
+  if (ts->unavailable_tablets > 0) {
     Out() << Substitute("Table $0 has $1 $2 tablet(s)",
                         table->name(),
-                        result_counts[CheckResult::UNDER_REPLICATED],
-                        Color(AnsiCode::YELLOW, "under-replicated")) << endl;
+                        ts->unavailable_tablets,
+                        Color(AnsiCode::RED, "unavailable")) << endl;
   }
   return false;
 }

http://git-wip-us.apache.org/repos/asf/kudu/blob/05542524/src/kudu/tools/ksck.h
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.h b/src/kudu/tools/ksck.h
index f367914..ea76c90 100644
--- a/src/kudu/tools/ksck.h
+++ b/src/kudu/tools/ksck.h
@@ -424,10 +424,11 @@ class Ksck {
   // Must first call FetchTableAndTabletInfo().
   Status ConnectToTabletServer(const std::shared_ptr<KsckTabletServer>& ts);
 
-  // Verifies that all the tables have contiguous tablets and that each tablet has enough replicas
-  // and a leader.
-  // Must first call FetchTableAndTabletInfo() and, if doing checks againt tablet
-  // servers (the default), must first call FetchInfoFromTabletServers().
+  // Verifies that all the tablets in all tables matching the filters have
+  // enough replicas, and that each tablet's view of the tablet's consensus
+  // matches every other tablet's and the master's.
+  // Must first call FetchTableAndTabletInfo() and, if doing checks against
+  // tablet servers (the default), must first call FetchInfoFromTabletServers().
   Status CheckTablesConsistency();
 
   // Verifies data checksums on all tablets by doing a scan of the database on each replica.
@@ -443,7 +444,36 @@ class Ksck {
     CONSENSUS_MISMATCH,
   };
 
-  bool VerifyTable(const std::shared_ptr<KsckTable>& table);
+  // Summarizes the result of VerifyTable().
+  struct TableSummary {
+    std::string name;
+    int healthy_tablets = 0;
+    int underreplicated_tablets = 0;
+    int consensus_mismatch_tablets = 0;
+    int unavailable_tablets = 0;
+
+    int TotalTablets() const {
+      return healthy_tablets + underreplicated_tablets +
+          consensus_mismatch_tablets + unavailable_tablets;
+    }
+
+    // Summarize the table's status with a tablet CheckResult.
+    // A table's status is determined by the health of the least healthy tablet.
+    CheckResult TableStatus() const {
+      if (unavailable_tablets > 0) {
+        return CheckResult::UNAVAILABLE;
+      }
+      if (consensus_mismatch_tablets > 0) {
+        return CheckResult::CONSENSUS_MISMATCH;
+      }
+      if (underreplicated_tablets > 0) {
+        return CheckResult::UNDER_REPLICATED;
+      }
+      return CheckResult::OK;
+    }
+  };
+
+  bool VerifyTable(const std::shared_ptr<KsckTable>& table, TableSummary* ts);
   bool VerifyTableWithTimeout(const std::shared_ptr<KsckTable>& table,
                               const MonoDelta& timeout,
                               const MonoDelta& retry_interval);


Mime
View raw message