kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ale...@apache.org
Subject [kudu] 01/03: KUDU-2711 (part 3). Only send one TSInfoPB per server in GetTableLocations
Date Tue, 07 May 2019 05:52:28 GMT
This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 586e957f76a547340f2ab93a7eebc3f116ff825e
Author: Todd Lipcon <todd@apache.org>
AuthorDate: Tue Feb 26 11:55:39 2019 -0800

    KUDU-2711 (part 3). Only send one TSInfoPB per server in GetTableLocations
    
    This changes the response for GetTableLocations to send a top-level list
    of TSInfoPB, and then each replica refers into that list by index. This
    avoids having to copy a TSInfoPB per replica, and instead bounds the
    number of serializations to the number of TS in the cluster. In the
    worst case, this may slightly regress performance if the number of
    replicas in the response is much fewer than the number of TS in the
    cluster, since there is some extra overhead of building the index
    mapping. In the best case (large number of replicas in the response)
    there is a big improvement.
    
    To handle wire compatibility, the server only uses the new response
    format in the case that the client indicates it expects it by passing
    a new boolean in the request.
    
    The client handles old servers by handling a response in either format.
    
    This patch only implements the new protocol in the C++ client. The Java client
    will be updated in a follow-up patch.
    
    Benchmarked a ~5x improvement with:
    
      table_locations-itest --gtest_filter=\*Bench\*  \
        --benchmark_num_tablets 300 --benchmark_num_threads 30 \
        --benchmark_runtime_secs=10
    
    with rwlock (previous patch):
      Count: 22179
      Mean: 3901.19
      Percentiles:
         0%  (min) = 52
        25%        = 3600
        50%  (med) = 3904
        75%        = 4192
        95%        = 4608
        99%        = 4896
        99.9%      = 5760
        99.99%     = 10048
        100% (max) = 10661
    
    with these improvements:
      Count: 109590
      Mean: 569.492
      Percentiles:
         0%  (min) = 31
        25%        = 512
        50%  (med) = 556
        75%        = 604
        95%        = 716
        99%        = 912
        99.9%      = 1232
        99.99%     = 2528
        100% (max) = 6336
    
    Change-Id: Ief65d0825e919f681b7ade6a856c103201f8305c
    Reviewed-on: http://gerrit.cloudera.org:8080/12615
    Reviewed-by: Adar Dembo <adar@cloudera.com>
    Tested-by: Will Berkeley <wdberkeley@gmail.com>
---
 .../org/apache/kudu/client/AsyncKuduClient.java    |  1 +
 src/kudu/client/meta_cache.cc                      | 62 +++++++++++++++-----
 src/kudu/client/meta_cache.h                       |  9 +--
 src/kudu/consensus/quorum_util-test.cc             |  6 ++
 src/kudu/consensus/quorum_util.cc                  | 23 +++++++-
 src/kudu/consensus/quorum_util.h                   |  8 +++
 src/kudu/integration-tests/registration-test.cc    |  6 +-
 .../integration-tests/table_locations-itest.cc     | 39 +++++++++++++
 src/kudu/master/catalog_manager.cc                 | 68 +++++++++++++++-------
 src/kudu/master/catalog_manager.h                  | 31 ++++++++--
 src/kudu/master/master.proto                       | 25 ++++++++
 src/kudu/master/master_service.cc                  |  4 +-
 12 files changed, 233 insertions(+), 49 deletions(-)

diff --git a/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java b/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
index 8f666ed..afc54d1 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
@@ -2160,6 +2160,7 @@ public class AsyncKuduClient implements AutoCloseable {
                        int requestedBatchSize,
                        List<Master.TabletLocationsPB> locations,
                        long ttl) throws KuduException {
+    // TODO(todd): handle "interned" response here
     String tableId = table.getTableId();
     String tableName = table.getName();
 
diff --git a/src/kudu/client/meta_cache.cc b/src/kudu/client/meta_cache.cc
index 1ac130f..30c144b 100644
--- a/src/kudu/client/meta_cache.cc
+++ b/src/kudu/client/meta_cache.cc
@@ -71,6 +71,7 @@ using master::GetTableLocationsRequestPB;
 using master::GetTableLocationsResponsePB;
 using master::MasterServiceProxy;
 using master::TabletLocationsPB;
+using master::TabletLocationsPB_InternedReplicaPB;
 using master::TabletLocationsPB_ReplicaPB;
 using master::TSInfoPB;
 using rpc::BackoffType;
@@ -184,20 +185,40 @@ void RemoteTabletServer::GetHostPorts(vector<HostPort>* host_ports) const {
 ////////////////////////////////////////////////////////////
 
 
-void RemoteTablet::Refresh(const TabletServerMap& tservers,
-                           const google::protobuf::RepeatedPtrField
-                             <TabletLocationsPB_ReplicaPB>& replicas) {
+Status RemoteTablet::Refresh(
+    const TabletServerMap& tservers,
+    const TabletLocationsPB& locs_pb,
+    const google::protobuf::RepeatedPtrField<TSInfoPB>& ts_info_dict) {
+
+  vector<std::pair<string, consensus::RaftPeerPB::Role>> uuid_and_role;
+
+  for (const TabletLocationsPB_ReplicaPB& r : locs_pb.replicas()) {
+    uuid_and_role.emplace_back(r.ts_info().permanent_uuid(), r.role());
+  }
+  for (const TabletLocationsPB_InternedReplicaPB& r : locs_pb.interned_replicas()) {
+    if (r.ts_info_idx() >= ts_info_dict.size()) {
+      return Status::Corruption(Substitute(
+          "invalid response from master: referenced tablet idx $0 but only $1 present",
+          r.ts_info_idx(), ts_info_dict.size()));
+    }
+    const TSInfoPB& ts_info = ts_info_dict.Get(r.ts_info_idx());
+    uuid_and_role.emplace_back(ts_info.permanent_uuid(), r.role());
+  }
+
   // Adopt the data from the successful response.
   std::lock_guard<simple_spinlock> l(lock_);
   replicas_.clear();
-  for (const TabletLocationsPB_ReplicaPB& r : replicas) {
+
+  for (const auto& p : uuid_and_role) {
     RemoteReplica rep;
-    rep.ts = FindOrDie(tservers, r.ts_info().permanent_uuid());
-    rep.role = r.role();
+    rep.ts = FindOrDie(tservers, p.first);
+    rep.role = p.second;
     rep.failed = false;
-    replicas_.push_back(rep);
+    replicas_.emplace_back(rep);
   }
+
   stale_ = false;
+  return Status::OK();
 }
 
 void RemoteTablet::MarkStale() {
@@ -683,6 +704,7 @@ void LookupRpc::SendRpcSlowPath() {
   req_.mutable_table()->set_table_id(table_->id());
   req_.set_partition_key_start(partition_key_);
   req_.set_max_returned_locations(locations_to_fetch());
+  req_.set_intern_ts_infos_in_response(true);
   if (replica_visibility_ == ReplicaController::Visibility::ALL) {
     req_.set_replica_type_filter(master::ANY_REPLICA);
   }
@@ -778,6 +800,18 @@ Status MetaCache::ProcessLookupResponse(const LookupRpc& rpc,
     VLOG(3) << "Caching '" << rpc.table_name() << "' entry " << entry.DebugString(rpc.table());
     InsertOrDie(&tablets_by_key, "", entry);
   } else {
+    // First, update the tserver cache, needed for the Refresh calls below.
+    for (const TabletLocationsPB& tablet : tablet_locations) {
+      for (const TabletLocationsPB_ReplicaPB& replicas : tablet.replicas()) {
+        UpdateTabletServer(replicas.ts_info());
+      }
+    }
+    // In the case of "interned" replicas, the above 'replicas' lists will be empty
+    // and instead we'll need to update from the top-level list of tservers.
+    const auto& ts_infos = rpc.resp().ts_infos();
+    for (const TSInfoPB& ts_info : ts_infos) {
+      UpdateTabletServer(ts_info);
+    }
 
     // The comments below will reference the following diagram:
     //
@@ -832,11 +866,6 @@ Status MetaCache::ProcessLookupResponse(const LookupRpc& rpc,
       // and the entry TTL. If the tablet is unknown, then we need to create a
       // new RemoteTablet for it.
 
-      // First, update the tserver cache, needed for the Refresh calls below.
-      for (const TabletLocationsPB_ReplicaPB& replicas : tablet.replicas()) {
-        UpdateTabletServer(replicas.ts_info());
-      }
-
       const string& tablet_id = tablet.tablet_id();
       scoped_refptr<RemoteTablet> remote = FindPtrOrNull(tablets_by_id_, tablet_id);
       if (remote.get() != nullptr) {
@@ -846,8 +875,9 @@ Status MetaCache::ProcessLookupResponse(const LookupRpc& rpc,
 
         VLOG(3) << "Refreshing tablet " << tablet_id << ": "
                 << pb_util::SecureShortDebugString(tablet);
-        remote->Refresh(ts_cache_, tablet.replicas());
-
+        RETURN_NOT_OK_PREPEND(remote->Refresh(ts_cache_, tablet, ts_infos),
+                              Substitute("failed to refresh locations for tablet $0",
+                                         tablet_id));
         // Update the entry TTL.
         auto& entry = FindOrDie(tablets_by_key, tablet_lower_bound);
         DCHECK(!entry.is_non_covered_range() &&
@@ -864,7 +894,9 @@ Status MetaCache::ProcessLookupResponse(const LookupRpc& rpc,
       Partition partition;
       Partition::FromPB(tablet.partition(), &partition);
       remote = new RemoteTablet(tablet_id, partition);
-      remote->Refresh(ts_cache_, tablet.replicas());
+      RETURN_NOT_OK_PREPEND(remote->Refresh(ts_cache_, tablet, ts_infos),
+                            Substitute("failed to refresh locations for tablet $0",
+                                       tablet_id));
 
       MetaCacheEntry entry(expiration_time, remote);
      VLOG(3) << "Caching '" << rpc.table_name() << "' entry " << entry.DebugString(rpc.table());
diff --git a/src/kudu/client/meta_cache.h b/src/kudu/client/meta_cache.h
index 83c104c..c645f54 100644
--- a/src/kudu/client/meta_cache.h
+++ b/src/kudu/client/meta_cache.h
@@ -53,8 +53,8 @@ class TabletServerServiceProxy;
 } // namespace tserver
 
 namespace master {
-class TabletLocationsPB_ReplicaPB;
 class TSInfoPB;
+class TabletLocationsPB;
 } // namespace master
 
 namespace client {
@@ -201,9 +201,10 @@ class RemoteTablet : public RefCountedThreadSafe<RemoteTablet> {
   }
 
   // Updates this tablet's replica locations.
-  void Refresh(const TabletServerMap& tservers,
-               const google::protobuf::RepeatedPtrField
-                 <master::TabletLocationsPB_ReplicaPB>& replicas);
+  Status Refresh(
+      const TabletServerMap& tservers,
+      const master::TabletLocationsPB& locs_pb,
+      const google::protobuf::RepeatedPtrField<master::TSInfoPB>& ts_info_dict);
 
   // Mark this tablet as stale, indicating that the cached tablet metadata is
   // out of date. Staleness is checked by the MetaCache when
diff --git a/src/kudu/consensus/quorum_util-test.cc b/src/kudu/consensus/quorum_util-test.cc
index a32e368..20d6393 100644
--- a/src/kudu/consensus/quorum_util-test.cc
+++ b/src/kudu/consensus/quorum_util-test.cc
@@ -307,6 +307,12 @@ TEST(QuorumUtilTest, TestGetConsensusRole) {
   ASSERT_EQ(NON_PARTICIPANT, GetConsensusRole("D", cstate));
   cstate.set_leader_uuid("D");
   ASSERT_EQ(NON_PARTICIPANT, GetConsensusRole("D", cstate)); // Illegal.
+
+  // Test GetParticipantRole() on the participants in the config.
+  for (const auto& peer : config2.peers()) {
+    ASSERT_EQ(GetParticipantRole(peer, cstate),
+              GetConsensusRole(peer.permanent_uuid(), cstate));
+  }
 }
 
 TEST(QuorumUtilTest, TestIsRaftConfigVoter) {
diff --git a/src/kudu/consensus/quorum_util.cc b/src/kudu/consensus/quorum_util.cc
index 88a231c..6a2745b 100644
--- a/src/kudu/consensus/quorum_util.cc
+++ b/src/kudu/consensus/quorum_util.cc
@@ -154,11 +154,30 @@ RaftPeerPB::Role GetConsensusRole(const std::string& peer_uuid,
                                   const ConsensusStatePB& cstate) {
   // The active config is the pending config if there is one, else it's the committed config.
   const RaftConfigPB& config = cstate.has_pending_config() ?
-                               cstate.pending_config() :
-                               cstate.committed_config();
+      cstate.pending_config() :
+      cstate.committed_config();
   return GetConsensusRole(peer_uuid, cstate.leader_uuid(), config);
 }
 
+
+RaftPeerPB::Role GetParticipantRole(const RaftPeerPB& peer,
+                                    const ConsensusStatePB& cstate) {
+  const auto& peer_uuid = peer.permanent_uuid();
+  DCHECK_NE(RaftPeerPB::NON_PARTICIPANT,
+            GetConsensusRole(peer_uuid, cstate))
+      << "Peer " << peer_uuid << " not a participant in " << cstate.ShortDebugString();
+
+  switch (peer.member_type()) {
+    case RaftPeerPB::VOTER:
+      if (peer_uuid == cstate.leader_uuid()) {
+        return RaftPeerPB::LEADER;
+      }
+      return RaftPeerPB::FOLLOWER;
+    default:
+      return RaftPeerPB::LEARNER;
+  }
+}
+
 Status VerifyRaftConfig(const RaftConfigPB& config) {
   std::set<string> uuids;
   if (config.peers().empty()) {
diff --git a/src/kudu/consensus/quorum_util.h b/src/kudu/consensus/quorum_util.h
index a4cc095..4cd30ba 100644
--- a/src/kudu/consensus/quorum_util.h
+++ b/src/kudu/consensus/quorum_util.h
@@ -95,6 +95,14 @@ RaftPeerPB::Role GetConsensusRole(const std::string& peer_uuid,
 RaftPeerPB::Role GetConsensusRole(const std::string& peer_uuid,
                                   const ConsensusStatePB& cstate);
 
+
+// Same as above, but requires that the given 'peer' is a participant
+// in the active configuration in specified consensus state.
+// If not, it will return incorrect results.
+RaftPeerPB::Role GetParticipantRole(const RaftPeerPB& peer,
+                                    const ConsensusStatePB& cstate);
+
+
 // Verifies that the provided configuration is well formed.
 Status VerifyRaftConfig(const RaftConfigPB& config);
 
diff --git a/src/kudu/integration-tests/registration-test.cc b/src/kudu/integration-tests/registration-test.cc
index 74fcf36..6e6eb36 100644
--- a/src/kudu/integration-tests/registration-test.cc
+++ b/src/kudu/integration-tests/registration-test.cc
@@ -200,7 +200,11 @@ class RegistrationTest : public KuduTest {
           break;  // exiting out of the 'do {...} while (false)' scope
         }
         RETURN_NOT_OK(ls);
-        s = catalog->GetTabletLocations(tablet_id, master::VOTER_REPLICA, &loc, /*user=*/none);
+        s = catalog->GetTabletLocations(tablet_id,
+                                        master::VOTER_REPLICA,
+                                        &loc,
+                                        /*ts_infos_dict=*/nullptr,
+                                        /*user=*/none);
       } while (false);
       if (s.ok() && loc.replicas_size() == expected_count) {
         if (locations) {
diff --git a/src/kudu/integration-tests/table_locations-itest.cc b/src/kudu/integration-tests/table_locations-itest.cc
index 1ec6126..efa5169 100644
--- a/src/kudu/integration-tests/table_locations-itest.cc
+++ b/src/kudu/integration-tests/table_locations-itest.cc
@@ -246,6 +246,44 @@ TEST_F(TableLocationsTest, TestGetTableLocations) {
     EXPECT_EQ("a", resp.tablet_locations(0).partition().partition_key_start());
     EXPECT_EQ("aa", resp.tablet_locations(1).partition().partition_key_start());
     EXPECT_EQ("ab", resp.tablet_locations(2).partition().partition_key_start());
+
+    // Check that a UUID was returned for every replica
+    for (const auto& loc : resp.tablet_locations()) {
+      ASSERT_GT(loc.replicas_size(), 0);
+      ASSERT_EQ(0, loc.interned_replicas_size());
+      for (const auto& replica : loc.replicas()) {
+        ASSERT_NE("", replica.ts_info().permanent_uuid());
+      }
+    }
+  }
+
+  { // from "", with TSInfo interning enabled.
+    GetTableLocationsRequestPB req;
+    GetTableLocationsResponsePB resp;
+    RpcController controller;
+    req.mutable_table()->set_table_name(table_name);
+    req.set_partition_key_start("");
+    req.set_max_returned_locations(3);
+    req.set_intern_ts_infos_in_response(true);
+    ASSERT_OK(proxy_->GetTableLocations(req, &resp, &controller));
+    SCOPED_TRACE(SecureDebugString(resp));
+
+    ASSERT_TRUE(!resp.has_error());
+    ASSERT_EQ(3, resp.tablet_locations().size());
+    EXPECT_EQ("a", resp.tablet_locations(0).partition().partition_key_start());
+    EXPECT_EQ("aa", resp.tablet_locations(1).partition().partition_key_start());
+    EXPECT_EQ("ab", resp.tablet_locations(2).partition().partition_key_start());
+    // Check that a UUID was returned for every replica
+    for (const auto& loc : resp.tablet_locations()) {
+      ASSERT_EQ(loc.replicas_size(), 0);
+      ASSERT_GT(loc.interned_replicas_size(), 0);
+      for (const auto& replica : loc.interned_replicas()) {
+        int idx = replica.ts_info_idx();
+        ASSERT_GE(idx, 0);
+        ASSERT_LE(idx, resp.ts_infos_size());
+        ASSERT_NE("", resp.ts_infos(idx).permanent_uuid());
+      }
+    }
   }
 
   { // from "b"
@@ -367,6 +405,7 @@ TEST_F(TableLocationsTest, GetTableLocationsBenchmark) {
           RpcController controller;
           req.mutable_table()->set_table_name(table_name);
           req.set_max_returned_locations(1000);
+          req.set_intern_ts_infos_in_response(true);
           CHECK_OK(proxies[i]->GetTableLocations(req, &resp, &controller));
           CHECK_EQ(resp.tablet_locations_size(), kNumSplits + 1);
         }
diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc
index 6d0e27d..88d0d16 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -4616,7 +4616,8 @@ void CatalogManager::SendCreateTabletRequest(const scoped_refptr<TabletInfo>&
ta
 Status CatalogManager::BuildLocationsForTablet(
     const scoped_refptr<TabletInfo>& tablet,
     ReplicaTypeFilter filter,
-    TabletLocationsPB* locs_pb) {
+    TabletLocationsPB* locs_pb,
+    TSInfosDict* ts_infos_dict) {
   TabletMetadataLock l_tablet(tablet.get(), LockMode::READ);
   if (PREDICT_FALSE(l_tablet.data().is_deleted())) {
     return Status::NotFound("Tablet deleted", l_tablet.data().pb.state_msg());
@@ -4632,8 +4633,6 @@ Status CatalogManager::BuildLocationsForTablet(
   const ConsensusStatePB& cstate = l_tablet.data().pb.consensus_state();
   for (const consensus::RaftPeerPB& peer : cstate.committed_config().peers()) {
     DCHECK(!peer.has_health_report()); // Health report shouldn't be persisted.
-    // TODO(adar): GetConsensusRole() iterates over all of the peers, making this an
-    // O(n^2) loop. If replication counts get high, it should be optimized.
     switch (filter) {
       case VOTER_REPLICA:
         if (!peer.has_member_type() ||
@@ -4655,24 +4654,44 @@ Status CatalogManager::BuildLocationsForTablet(
         }
     }
 
-    TabletLocationsPB_ReplicaPB* replica_pb = locs_pb->add_replicas();
-    replica_pb->set_role(GetConsensusRole(peer.permanent_uuid(), cstate));
-
-    TSInfoPB* tsinfo_pb = replica_pb->mutable_ts_info();
-    tsinfo_pb->set_permanent_uuid(peer.permanent_uuid());
+    // Helper function to create a TSInfoPB.
+    auto make_tsinfo_pb = [this, &peer]() {
+      unique_ptr<TSInfoPB> tsinfo_pb(new TSInfoPB());
+      tsinfo_pb->set_permanent_uuid(peer.permanent_uuid());
+      shared_ptr<TSDescriptor> ts_desc;
+      if (master_->ts_manager()->LookupTSByUUID(peer.permanent_uuid(), &ts_desc)) {
+        ServerRegistrationPB reg;
+        ts_desc->GetRegistration(&reg);
+        tsinfo_pb->mutable_rpc_addresses()->Swap(reg.mutable_rpc_addresses());
+        if (ts_desc->location()) tsinfo_pb->set_location(*(ts_desc->location()));
+      } else {
+        // If we've never received a heartbeat from the tserver, we'll fall back
+        // to the last known RPC address in the RaftPeerPB.
+        //
+        // TODO(wdberkeley): We should track these RPC addresses in the master table itself.
+        tsinfo_pb->add_rpc_addresses()->CopyFrom(peer.last_known_addr());
+      }
+      return tsinfo_pb;
+    };
 
-    shared_ptr<TSDescriptor> ts_desc;
-    if (master_->ts_manager()->LookupTSByUUID(peer.permanent_uuid(), &ts_desc)) {
-      ServerRegistrationPB reg;
-      ts_desc->GetRegistration(&reg);
-      tsinfo_pb->mutable_rpc_addresses()->Swap(reg.mutable_rpc_addresses());
-      if (ts_desc->location()) tsinfo_pb->set_location(*(ts_desc->location()));
+    auto role = GetParticipantRole(peer, cstate);
+    if (ts_infos_dict) {
+      int idx = *ComputeIfAbsent(
+          &ts_infos_dict->uuid_to_idx, peer.permanent_uuid(),
+          [&]() -> int {
+            auto pb = make_tsinfo_pb();
+            int idx = ts_infos_dict->ts_info_pbs.size();
+            ts_infos_dict->ts_info_pbs.emplace_back(pb.release());
+            return idx;
+          });
+
+      auto* interned_replica_pb = locs_pb->add_interned_replicas();
+      interned_replica_pb->set_ts_info_idx(idx);
+      interned_replica_pb->set_role(role);
     } else {
-      // If we've never received a heartbeat from the tserver, we'll fall back
-      // to the last known RPC address in the RaftPeerPB.
-      //
-      // TODO: We should track these RPC addresses in the master table itself.
-      tsinfo_pb->add_rpc_addresses()->CopyFrom(peer.last_known_addr());
+      TabletLocationsPB_ReplicaPB* replica_pb = locs_pb->add_replicas();
+      replica_pb->set_allocated_ts_info(make_tsinfo_pb().release());
+      replica_pb->set_role(role);
     }
   }
 
@@ -4688,6 +4707,7 @@ Status CatalogManager::BuildLocationsForTablet(
 Status CatalogManager::GetTabletLocations(const string& tablet_id,
                                           ReplicaTypeFilter filter,
                                           TabletLocationsPB* locs_pb,
+                                          TSInfosDict* ts_infos_dict,
                                           optional<const string&> user) {
   leader_lock_.AssertAcquiredForReading();
   RETURN_NOT_OK(CheckOnline());
@@ -4710,7 +4730,7 @@ Status CatalogManager::GetTabletLocations(const string& tablet_id,
         NormalizeTableName(table_lock.data().name()), *user));
   }
 
-  return BuildLocationsForTablet(tablet_info, filter, locs_pb);
+  return BuildLocationsForTablet(tablet_info, filter, locs_pb, ts_infos_dict);
 }
 
 Status CatalogManager::ReplaceTablet(const string& tablet_id, ReplaceTabletResponsePB* resp) {
@@ -4828,9 +4848,12 @@ Status CatalogManager::GetTableLocations(const GetTableLocationsRequestPB*
req,
   vector<scoped_refptr<TabletInfo>> tablets_in_range;
   table->GetTabletsInRange(req, &tablets_in_range);
 
+  TSInfosDict infos_dict;
+
   for (const auto& tablet : tablets_in_range) {
     Status s = BuildLocationsForTablet(
-        tablet, req->replica_type_filter(), resp->add_tablet_locations());
+        tablet, req->replica_type_filter(), resp->add_tablet_locations(),
+        req->intern_ts_infos_in_response() ? &infos_dict : nullptr);
     if (s.ok()) {
       continue;
     }
@@ -4854,6 +4877,9 @@ Status CatalogManager::GetTableLocations(const GetTableLocationsRequestPB*
req,
           << s.ToString();
     }
   }
+  for (auto& pb : infos_dict.ts_info_pbs) {
+    resp->mutable_ts_infos()->AddAllocated(pb.release());
+  }
   resp->set_ttl_millis(FLAGS_table_locations_ttl_ms);
   return Status::OK();
 }
diff --git a/src/kudu/master/catalog_manager.h b/src/kudu/master/catalog_manager.h
index b7c8290..e78cbda 100644
--- a/src/kudu/master/catalog_manager.h
+++ b/src/kudu/master/catalog_manager.h
@@ -34,12 +34,14 @@
 #include <boost/optional/optional.hpp>
 #include <glog/logging.h>
 #include <gtest/gtest_prod.h>
+#include <sparsehash/dense_hash_map>
 
 #include "kudu/consensus/metadata.pb.h"
 #include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/macros.h"
 #include "kudu/gutil/port.h"
 #include "kudu/gutil/ref_counted.h"
+#include "kudu/gutil/strings/stringpiece.h"
 #include "kudu/master/master.pb.h"
 #include "kudu/tserver/tablet_replica_lookup.h"
 #include "kudu/tserver/tserver.pb.h"
@@ -51,6 +53,8 @@
 #include "kudu/util/rw_mutex.h"
 #include "kudu/util/status.h"
 
+template <class X> struct GoodFastHash;
+
 namespace kudu {
 
 class AuthzTokenTest_TestSingleMasterUnavailable_Test;
@@ -102,7 +106,6 @@ class PlacementPolicy;
 class SysCatalogTable;
 class TSDescriptor;
 class TableInfo;
-
 struct DeferredAssignmentActions;
 
 // The data related to a tablet which is persisted on disk.
@@ -600,11 +603,22 @@ class CatalogManager : public tserver::TabletReplicaLookupIf {
                            GetTableLocationsResponsePB* resp,
                            boost::optional<const std::string&> user);
 
+  struct TSInfosDict {
+    std::vector<std::unique_ptr<TSInfoPB>> ts_info_pbs;
+    google::dense_hash_map<StringPiece, int, GoodFastHash<StringPiece>> uuid_to_idx;
+    TSInfosDict() {
+      uuid_to_idx.set_empty_key(StringPiece());
+    }
+  };
+
   // Look up the locations of the given tablet. If 'user' is provided, checks
   // that the user is authorized to get such information. Adds only information
   // on replicas which satisfy the 'filter'. The locations vector is overwritten
   // (not appended to).
   //
+  // If 'ts_infos_dict' is not null, the returned locations use it as a dictionary
+  // to reference TSInfoPBs instead of making many copies.
+  //
   // If the tablet is not found, returns Status::NotFound.
   // If the tablet is not running, returns Status::ServiceUnavailable.
   // Otherwise, returns Status::OK and puts the result in 'locs_pb'.
@@ -612,6 +626,7 @@ class CatalogManager : public tserver::TabletReplicaLookupIf {
   Status GetTabletLocations(const std::string& tablet_id,
                             master::ReplicaTypeFilter filter,
                             TabletLocationsPB* locs_pb,
+                            TSInfosDict* ts_infos_dict,
                             boost::optional<const std::string&> user);
 
   // Replace the given tablet with a new, empty one. The replaced tablet is
@@ -847,13 +862,19 @@ class CatalogManager : public tserver::TabletReplicaLookupIf {
   scoped_refptr<TabletInfo> CreateTabletInfo(const scoped_refptr<TableInfo>& table,
                                              const PartitionPB& partition);
 
+
   // Builds the TabletLocationsPB for a tablet based on the provided TabletInfo
-  // and the replica type filter specified. Populates locs_pb and returns
-  // Status::OK on success. Returns Status::ServiceUnavailable if tablet is
-  // not running.
+  // and the replica type filter specified. Populates locs_pb and returns
+  // Status::OK on success.
+  //
+  // If 'ts_infos_dict' is not null, the returned locations use it as a dictionary
+  // to reference TSInfoPBs.
+  //
+  // Returns Status::ServiceUnavailable if tablet is not running.
   Status BuildLocationsForTablet(const scoped_refptr<TabletInfo>& tablet,
                                  master::ReplicaTypeFilter filter,
-                                 TabletLocationsPB* locs_pb);
+                                 TabletLocationsPB* locs_pb,
+                                 TSInfosDict* ts_infos_dict);
 
   // Looks up the table, locks it with the provided lock mode, and, if 'user' is
   // provided, checks that the user is authorized to operate on the table.
diff --git a/src/kudu/master/master.proto b/src/kudu/master/master.proto
index 89afecd..8bbced0 100644
--- a/src/kudu/master/master.proto
+++ b/src/kudu/master/master.proto
@@ -357,11 +357,18 @@ message TSHeartbeatResponsePB {
 //////////////////////////////
 
 message TabletLocationsPB {
+  // DEPRECATED: new clients should prefer the 'Interned' type below.
   message ReplicaPB {
     required TSInfoPB ts_info = 1;
     required consensus.RaftPeerPB.Role role = 2;
   }
 
+  message InternedReplicaPB {
+    // Index into the 'ts_infos' list in the top-level RPC response.
+    required uint32 ts_info_idx = 1;
+    required consensus.RaftPeerPB.Role role = 2;
+  }
+
   required bytes tablet_id = 1;
 
   // DEPRECATED.
@@ -370,8 +377,16 @@ message TabletLocationsPB {
 
   optional PartitionPB partition = 6;
 
+  // Used only if interned replicas are not supported by client.
   repeated ReplicaPB replicas = 4;
 
+  // More efficient representation of replicas: instead of duplicating the TSInfoPB
+  // in each tablet location, instead we just encode indexes into a list of TSInfoPB
+  // which is serialized in the top-level response.
+  //
+  // Used when supported by client.
+  repeated InternedReplicaPB interned_replicas = 7;
+
   // DEPRECATED. Still set by servers, but should be ignored by clients.
   optional bool DEPRECATED_stale = 5;
 }
@@ -404,6 +419,9 @@ message GetTabletLocationsRequestPB {
 
   // What type of tablet replicas to include in the response.
   optional ReplicaTypeFilter replica_type_filter = 2 [ default = VOTER_REPLICA ];
+
+  // NOTE: we don't support the "interned" TSInfoPB response in this request
+  // since this RPC is currently called with a small number of tablets.
 }
 
 message GetTabletLocationsResponsePB {
@@ -497,6 +515,9 @@ message GetTableLocationsRequestPB {
   // What type of tablet replicas to include in the
   // 'GetTableLocationsResponsePB::tablet_locations' response field.
   optional ReplicaTypeFilter replica_type_filter = 6 [ default = VOTER_REPLICA ];
+
+  // Whether the response should use the 'interned_replicas' field.
+  optional bool intern_ts_infos_in_response = 7 [ default = false ];
 }
 
 // The response to a GetTableLocations RPC. The master guarantees that:
@@ -518,6 +539,10 @@ message GetTableLocationsResponsePB {
 
   repeated TabletLocationsPB tablet_locations = 2;
 
+  // Used if 'intern_ts_infos_in_response' was requested.
+  // See InternedReplicaPB above.
+  repeated TSInfoPB ts_infos = 4;
+
   // If the client caches table locations, the entries should not live longer
   // than this timeout. Defaults to one hour.
   optional uint32 ttl_millis = 3 [default = 36000000];
diff --git a/src/kudu/master/master_service.cc b/src/kudu/master/master_service.cc
index eaab0b9..fadfae4 100644
--- a/src/kudu/master/master_service.cc
+++ b/src/kudu/master/master_service.cc
@@ -309,7 +309,9 @@ void MasterServiceImpl::GetTabletLocations(const GetTabletLocationsRequestPB*
re
     // TODO(todd): once we have catalog data. ACL checks would also go here, probably.
     TabletLocationsPB* locs_pb = resp->add_tablet_locations();
     Status s = server_->catalog_manager()->GetTabletLocations(
-        tablet_id, req->replica_type_filter(), locs_pb,
+        tablet_id, req->replica_type_filter(),
+        locs_pb,
+        /*ts_infos_dict=*/nullptr,
         make_optional<const string&>(rpc->remote_user().username()));
     if (!s.ok()) {
       resp->mutable_tablet_locations()->RemoveLast();


Mime
View raw message