kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jdcry...@apache.org
Subject [3/3] kudu git commit: Avoid extra fsyncs of tombstoned tablets during startup
Date Tue, 08 Nov 2016 16:54:15 GMT
Avoid extra fsyncs of tombstoned tablets during startup

At startup, we roll-forward the deletion of any tombstoned tablets. This
is often redundant if the tablet is already fully deleted (i.e. no
blocks, no orphaned blocks, and no WAL). In that case, we can skip the
expensive fsync to speed up startup.

Before this patch, a server with a few thousand tombstoned tablets took
~6 minutes to start up (about ~100ms per tablet). After the patch, it
only took 1 second.

This patch also removes scary-looking 'WARNING' messages being printed
for every tombstoned tablet at startup.

Change-Id: I60cb184b8de2a6a381371ddcf2fb938a19757eec
Reviewed-on: http://gerrit.cloudera.org:8080/4915
Tested-by: Kudu Jenkins
Reviewed-by: Dinesh Bhat <dinesh@cloudera.com>
Reviewed-by: Jean-Daniel Cryans <jdcryans@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/141de337
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/141de337
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/141de337

Branch: refs/heads/master
Commit: 141de3377e6bdd95b8a455d262c00066f23bc021
Parents: c7090f9
Author: Todd Lipcon <todd@apache.org>
Authored: Wed Nov 2 11:49:05 2016 -0700
Committer: Jean-Daniel Cryans <jdcryans@apache.org>
Committed: Tue Nov 8 16:53:48 2016 +0000

----------------------------------------------------------------------
 src/kudu/consensus/consensus_meta.cc            |  1 +
 src/kudu/consensus/log.cc                       |  6 +++++
 src/kudu/consensus/log.h                        |  3 +++
 src/kudu/integration-tests/delete_table-test.cc | 11 ++++++++-
 .../external_mini_cluster_fs_inspector.cc       | 25 ++++++++++++++++----
 .../external_mini_cluster_fs_inspector.h        |  8 +++++++
 src/kudu/tablet/tablet_metadata.cc              |  7 ++++++
 src/kudu/tablet/tablet_metadata.h               |  5 ++++
 src/kudu/tserver/ts_tablet_manager.cc           | 17 +++++++++----
 9 files changed, 73 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/141de337/src/kudu/consensus/consensus_meta.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_meta.cc b/src/kudu/consensus/consensus_meta.cc
index cdb4186..34f4506 100644
--- a/src/kudu/consensus/consensus_meta.cc
+++ b/src/kudu/consensus/consensus_meta.cc
@@ -68,6 +68,7 @@ Status ConsensusMetadata::DeleteOnDiskData(FsManager* fs_manager, const
string&
   if (!env->FileExists(cmeta_path)) {
     return Status::OK();
   }
+  LOG(INFO) << "T " << tablet_id << " Deleting consensus metadata";
   RETURN_NOT_OK_PREPEND(env->DeleteFile(cmeta_path),
                         "Unable to delete consensus metadata file for tablet " + tablet_id);
   return Status::OK();

http://git-wip-us.apache.org/repos/asf/kudu/blob/141de337/src/kudu/consensus/log.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/log.cc b/src/kudu/consensus/log.cc
index 4225c07..42206c1 100644
--- a/src/kudu/consensus/log.cc
+++ b/src/kudu/consensus/log.cc
@@ -829,12 +829,18 @@ Status Log::Close() {
   }
 }
 
+bool Log::HasOnDiskData(FsManager* fs_manager, const string& tablet_id) {
+  string wal_dir = fs_manager->GetTabletWalDir(tablet_id);
+  return fs_manager->env()->FileExists(wal_dir);
+}
+
 Status Log::DeleteOnDiskData(FsManager* fs_manager, const string& tablet_id) {
   string wal_dir = fs_manager->GetTabletWalDir(tablet_id);
   Env* env = fs_manager->env();
   if (!env->FileExists(wal_dir)) {
     return Status::OK();
   }
+  LOG(INFO) << "T " << tablet_id << " Deleting WAL directory";
   RETURN_NOT_OK_PREPEND(env->DeleteRecursively(wal_dir),
                         "Unable to recursively delete WAL dir for tablet " + tablet_id);
   return Status::OK();

http://git-wip-us.apache.org/repos/asf/kudu/blob/141de337/src/kudu/consensus/log.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/log.h b/src/kudu/consensus/log.h
index 46c573c..f947b90 100644
--- a/src/kudu/consensus/log.h
+++ b/src/kudu/consensus/log.h
@@ -148,6 +148,9 @@ class Log : public RefCountedThreadSafe<Log> {
   // Syncs all state and closes the log.
   Status Close();
 
+  // Return true if there is any on-disk data for the given tablet.
+  static bool HasOnDiskData(FsManager* fs_manager, const std::string& tablet_id);
+
   // Delete all WAL data from the log associated with this tablet.
   // REQUIRES: The Log must be closed.
   static Status DeleteOnDiskData(FsManager* fs_manager, const std::string& tablet_id);

http://git-wip-us.apache.org/repos/asf/kudu/blob/141de337/src/kudu/integration-tests/delete_table-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/delete_table-test.cc b/src/kudu/integration-tests/delete_table-test.cc
index e48d917..fd1803d 100644
--- a/src/kudu/integration-tests/delete_table-test.cc
+++ b/src/kudu/integration-tests/delete_table-test.cc
@@ -1196,7 +1196,6 @@ TEST_P(DeleteTableTombstonedParamTest, TestTabletTombstone) {
   ASSERT_OK(cluster_->tablet_server(kTsIndex)->Restart());
   LOG(INFO) << "Waiting for second tablet to be tombstoned...";
   NO_FATALS(WaitForTabletTombstonedOnTS(kTsIndex, tablet_id, CMETA_EXPECTED));
-
   // The tombstoned tablets will still show up in ListTablets(),
   // just with their data state set as TOMBSTONED. They should also be listed
   // as NOT_STARTED because we restarted the server.
@@ -1207,6 +1206,16 @@ TEST_P(DeleteTableTombstonedParamTest, TestTabletTombstone) {
         << t.tablet_status().tablet_id() << " not tombstoned";
   }
 
+  // Check that, upon restart of the tablet server with a tombstoned tablet,
+  // we don't unnecessary "roll forward" and rewrite the tablet metadata file
+  // when it is already fully deleted.
+  int64_t orig_mtime = inspect_->GetTabletSuperBlockMTimeOrDie(kTsIndex, tablet_id);
+  cluster_->tablet_server(kTsIndex)->Shutdown();
+  ASSERT_OK(cluster_->tablet_server(kTsIndex)->Restart());
+  int64_t new_mtime = inspect_->GetTabletSuperBlockMTimeOrDie(kTsIndex, tablet_id);
+  ASSERT_EQ(orig_mtime, new_mtime)
+      << "Tablet superblock should not have been re-flushed unnecessarily";
+
   // Finally, delete all tablets on the TS, and wait for all data to be gone.
   LOG(INFO) << "Deleting all tablets...";
   for (const ListTabletsResponsePB::StatusAndSchemaPB& tablet : tablets) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/141de337/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
index 19262ca..4ddbdbf 100644
--- a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
+++ b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
@@ -17,6 +17,10 @@
 
 #include "kudu/integration-tests/external_mini_cluster_fs_inspector.h"
 
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
 #include <algorithm>
 #include <set>
 
@@ -166,13 +170,26 @@ Status ExternalMiniClusterFsInspector::CheckNoData() {
   return Status::OK();;
 }
 
+string ExternalMiniClusterFsInspector::GetTabletSuperBlockPathOnTS(int ts_index,
+                                                                   const string& tablet_id)
const {
+  string data_dir = cluster_->tablet_server(ts_index)->data_dir();
+  string meta_dir = JoinPathSegments(data_dir, FsManager::kTabletMetadataDirName);
+  return JoinPathSegments(meta_dir, tablet_id);
+}
+
 Status ExternalMiniClusterFsInspector::ReadTabletSuperBlockOnTS(int index,
                                                                 const string& tablet_id,
                                                                 TabletSuperBlockPB* sb) {
-  string data_dir = cluster_->tablet_server(index)->data_dir();
-  string meta_dir = JoinPathSegments(data_dir, FsManager::kTabletMetadataDirName);
-  string superblock_path = JoinPathSegments(meta_dir, tablet_id);
-  return pb_util::ReadPBContainerFromPath(env_, superblock_path, sb);
+  const auto& sb_path = GetTabletSuperBlockPathOnTS(index, tablet_id);
+  return pb_util::ReadPBContainerFromPath(env_, sb_path, sb);
+}
+
+int64_t ExternalMiniClusterFsInspector::GetTabletSuperBlockMTimeOrDie(
+    int ts_index, const std::string& tablet_id) {
+  const auto& sb_path = GetTabletSuperBlockPathOnTS(ts_index, tablet_id);
+  struct stat s;
+  CHECK_ERR(stat(sb_path.c_str(), &s)) << "failed to stat: " << sb_path;
+  return s.st_mtim.tv_sec * 1e6 + s.st_mtim.tv_nsec / 1000;
 }
 
 string ExternalMiniClusterFsInspector::GetConsensusMetadataPathOnTS(int index,

http://git-wip-us.apache.org/repos/asf/kudu/blob/141de337/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
index 6616498..25af8f0 100644
--- a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
+++ b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
@@ -80,6 +80,11 @@ class ExternalMiniClusterFsInspector {
 
   Status ReadTabletSuperBlockOnTS(int index, const std::string& tablet_id,
                                   tablet::TabletSuperBlockPB* sb);
+
+  // Get the modification time (in micros) of the tablet superblock for the given tablet
+  // server index and tablet ID.
+  int64_t GetTabletSuperBlockMTimeOrDie(int ts_index, const std::string& tablet_id);
+
   Status ReadConsensusMetadataOnTS(int index, const std::string& tablet_id,
                                    consensus::ConsensusMetadataPB* cmeta_pb);
   Status WriteConsensusMetadataOnTS(int index,
@@ -124,6 +129,9 @@ class ExternalMiniClusterFsInspector {
   std::string GetConsensusMetadataPathOnTS(int index,
                                            const std::string& tablet_id) const;
 
+  std::string GetTabletSuperBlockPathOnTS(int ts_index,
+                                          const std::string& tablet_id) const;
+
   Env* const env_;
   ExternalMiniCluster* const cluster_;
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/141de337/src/kudu/tablet/tablet_metadata.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_metadata.cc b/src/kudu/tablet/tablet_metadata.cc
index 0771c51..a0ad971 100644
--- a/src/kudu/tablet/tablet_metadata.cc
+++ b/src/kudu/tablet/tablet_metadata.cc
@@ -187,6 +187,13 @@ Status TabletMetadata::DeleteTabletData(TabletDataState delete_type,
   return Flush();
 }
 
+bool TabletMetadata::IsTombstonedWithNoBlocks() const {
+  std::lock_guard<LockType> l(data_lock_);
+  return tablet_data_state_ == TABLET_DATA_TOMBSTONED &&
+      rowsets_.empty() &&
+      orphaned_blocks_.empty();
+}
+
 Status TabletMetadata::DeleteSuperBlock() {
   std::lock_guard<LockType> l(data_lock_);
   if (!orphaned_blocks_.empty()) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/141de337/src/kudu/tablet/tablet_metadata.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_metadata.h b/src/kudu/tablet/tablet_metadata.h
index 45341a3..383bae4 100644
--- a/src/kudu/tablet/tablet_metadata.h
+++ b/src/kudu/tablet/tablet_metadata.h
@@ -185,6 +185,11 @@ class TabletMetadata : public RefCountedThreadSafe<TabletMetadata>
{
   Status DeleteTabletData(TabletDataState delete_type,
                           const boost::optional<consensus::OpId>& last_logged_opid);
 
+  // Return true if this metadata references no blocks (either live or orphaned) and is
+  // already marked as tombstoned. If this is the case, then calling DeleteTabletData
+  // would be a no-op.
+  bool IsTombstonedWithNoBlocks() const;
+
   // Permanently deletes the superblock from the disk.
   // DeleteTabletData() must first be called and the tablet data state must be
   // TABLET_DATA_DELETED.

http://git-wip-us.apache.org/repos/asf/kudu/blob/141de337/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index c3ba3c7..bcce21c 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -887,17 +887,24 @@ Status TSTabletManager::HandleNonReadyTabletOnStartup(const scoped_refptr<Tablet
       << "Unexpected TabletDataState in tablet " << tablet_id << ": "
       << TabletDataState_Name(data_state) << " (" << data_state <<
")";
 
-  // Roll forward deletions, as needed.
-  LOG(WARNING) << LogPrefix(tablet_id) << "Tablet Manager startup: Rolling forward
tablet deletion "
-               << "of type " << TabletDataState_Name(data_state);
+  // If the tablet is already fully tombstoned with no remaining data or WAL,
+  // then no need to roll anything forward.
+  bool skip_deletion = meta->IsTombstonedWithNoBlocks() &&
+      !Log::HasOnDiskData(meta->fs_manager(), tablet_id);
+
+  LOG_IF(WARNING, !skip_deletion)
+      << LogPrefix(tablet_id) << "Tablet Manager startup: Rolling forward tablet
deletion "
+      << "of type " << TabletDataState_Name(data_state);
 
   if (data_state == TABLET_DATA_COPYING) {
     // We tombstone tablets that failed to copy.
     data_state = TABLET_DATA_TOMBSTONED;
   }
 
-  // Passing no OpId will retain the last_logged_opid that was previously in the metadata.
-  RETURN_NOT_OK(DeleteTabletData(meta, data_state, boost::none));
+  if (!skip_deletion) {
+    // Passing no OpId will retain the last_logged_opid that was previously in the metadata.
+    RETURN_NOT_OK(DeleteTabletData(meta, data_state, boost::none));
+  }
 
   // Register TOMBSTONED tablets so that they get reported to the Master, which
   // allows us to permanently delete replica tombstones when a table gets


Mime
View raw message