kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From t...@apache.org
Subject incubator-kudu git commit: KUDU-1274: append last error when scan times out
Date Thu, 28 Jan 2016 18:45:18 GMT
Repository: incubator-kudu
Updated Branches:
  refs/heads/master 67f2ed946 -> b7067c645


KUDU-1274: append last error when scan times out

The scan retry code is a real mess; at the very least it should make use
of RpcRetrier to handle retrying. But that's a project for another time.

Change-Id: I7b25b5a6fea18a096143dd82909df2b9a83d7938
Reviewed-on: http://gerrit.cloudera.org:8080/1942
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <todd@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/b7067c64
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/b7067c64
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/b7067c64

Branch: refs/heads/master
Commit: b7067c6453b1b866853a9e3b29f3017ab6d1b973
Parents: 67f2ed9
Author: Adar Dembo <adar@cloudera.com>
Authored: Wed Jan 27 19:01:01 2016 -0800
Committer: Todd Lipcon <todd@apache.org>
Committed: Thu Jan 28 18:41:37 2016 +0000

----------------------------------------------------------------------
 src/kudu/client/client-test.cc      | 32 ++++++++++++++++++++++++++++++--
 src/kudu/client/scanner-internal.cc | 20 ++++++++++++++++++--
 src/kudu/client/scanner-internal.h  | 11 +++++++++++
 3 files changed, 59 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/b7067c64/src/kudu/client/client-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/client-test.cc b/src/kudu/client/client-test.cc
index bc7edd5..172a634 100644
--- a/src/kudu/client/client-test.cc
+++ b/src/kudu/client/client-test.cc
@@ -65,14 +65,12 @@ DECLARE_int32(log_inject_latency_ms_mean);
 DECLARE_int32(log_inject_latency_ms_stddev);
 DECLARE_int32(master_inject_latency_on_tablet_lookups_ms);
 DECLARE_int32(max_create_tablets_per_ts);
-DECLARE_int32(rpc_service_queue_length);
 DECLARE_int32(scanner_gc_check_interval_us);
 DECLARE_int32(scanner_inject_latency_on_each_batch_ms);
 DECLARE_int32(scanner_max_batch_size_bytes);
 DECLARE_int32(scanner_ttl_ms);
 DEFINE_int32(test_scan_num_rows, 1000, "Number of rows to insert and scan");
 
-METRIC_DECLARE_counter(scans_started);
 METRIC_DECLARE_counter(rpcs_queue_overflow);
 
 using std::string;
@@ -2688,5 +2686,35 @@ TEST_F(ClientTest, TestServerTooBusyRetry) {
   }
 }
 
+TEST_F(ClientTest, TestLastErrorEmbeddedInScanTimeoutStatus) {
+  // For the random() calls that take place during scan retries.
+  SeedRandom();
+
+  NO_FATALS(InsertTestRows(client_table_.get(), FLAGS_test_scan_num_rows));
+
+  {
+    // Revert the latency injection flags at the end so the test exits faster.
+    google::FlagSaver saver;
+
+    // Restart, but inject latency so that startup is very slow.
+    FLAGS_log_inject_latency = true;
+    FLAGS_log_inject_latency_ms_mean = 1000;
+    FLAGS_log_inject_latency_ms_stddev = 0;
+    for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
+      MiniTabletServer* ts = cluster_->mini_tablet_server(i);
+      ASSERT_OK(ts->Restart());
+    }
+
+    // As the tservers are still starting up, the scan will retry until it
+    // times out. The actual error should be embedded in the returned status.
+    KuduScanner scan(client_table_.get());
+    ASSERT_OK(scan.SetTimeoutMillis(1000));
+    Status s = scan.Open();
+    SCOPED_TRACE(s.ToString());
+    ASSERT_TRUE(s.IsTimedOut());
+    ASSERT_STR_CONTAINS(s.ToString(), "Illegal state: Tablet not RUNNING");
+  }
+}
+
 } // namespace client
 } // namespace kudu

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/b7067c64/src/kudu/client/scanner-internal.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/scanner-internal.cc b/src/kudu/client/scanner-internal.cc
index 3b45a0b..3a2ec86 100644
--- a/src/kudu/client/scanner-internal.cc
+++ b/src/kudu/client/scanner-internal.cc
@@ -38,6 +38,8 @@ using std::string;
 namespace kudu {
 
 using rpc::RpcController;
+using strings::Substitute;
+using strings::SubstituteAndAppend;
 using tserver::ColumnRangePredicatePB;
 using tserver::NewScanRequestPB;
 using tserver::ScanResponsePB;
@@ -106,6 +108,7 @@ Status KuduScanner::Data::CanBeRetried(const bool isNewScan,
       !rpc_status.ok() &&
       controller_.error_response() &&
       controller_.error_response()->code() == rpc::ErrorStatusPB::ERROR_SERVER_TOO_BUSY)
{
+    UpdateLastError(rpc_status);
 
     // Exponential backoff with jitter anchored between 10ms and 20ms, and an
     // upper bound between 2.5s and 5s.
@@ -114,7 +117,10 @@ Status KuduScanner::Data::CanBeRetried(const bool isNewScan,
     MonoTime now = MonoTime::Now(MonoTime::FINE);
     now.AddDelta(sleep);
     if (deadline.ComesBefore(now)) {
-      return Status::TimedOut("unable to retry before timeout", rpc_status.ToString());
+      Status ret = Status::TimedOut("unable to retry before timeout",
+                                    rpc_status.ToString());
+      return last_error_.ok() ?
+          ret : ret.CloneAndAppend(last_error_.ToString());
     }
     LOG(INFO) << "Retrying scan to busy tablet server " << ts_->ToString()
               << " after " << sleep.ToString() << "; attempt " <<
scan_attempts_;
@@ -129,9 +135,11 @@ Status KuduScanner::Data::CanBeRetried(const bool isNewScan,
       // We didn't wait a full RPC timeout though, so don't mark the tserver as failed.
       LOG(INFO) << "Scan of tablet " << remote_->tablet_id() << " at
"
           << ts_->ToString() << " deadline expired.";
-      return rpc_status;
+      return last_error_.ok()
+          ? rpc_status : rpc_status.CloneAndAppend(last_error_.ToString());
     } else {
       // All other types of network errors are retriable, and also indicate the tserver is
failed.
+      UpdateLastError(rpc_status);
       table_->client()->data_->meta_cache_->MarkTSFailed(ts_, rpc_status);
     }
   }
@@ -161,6 +169,8 @@ Status KuduScanner::Data::CanBeRetried(const bool isNewScan,
   //   - Any other error    : Fatal. This indicates an unexpected error while processing
the scan
   //                          request.
   if (rpc_status.ok() && !server_status.ok()) {
+    UpdateLastError(server_status);
+
     const tserver::TabletServerErrorPB& error = last_response_.error();
     switch (error.code()) {
       case tserver::TabletServerErrorPB::SCANNER_EXPIRED:
@@ -433,6 +443,12 @@ void KuduScanner::Data::PrepareRequest(RequestType state) {
   }
 }
 
+void KuduScanner::Data::UpdateLastError(const Status& error) {
+  if (last_error_.ok() || last_error_.IsTimedOut()) {
+    last_error_ = error;
+  }
+}
+
 
 ////////////////////////////////////////////////////////////
 // KuduScanBatch

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/b7067c64/src/kudu/client/scanner-internal.h
----------------------------------------------------------------------
diff --git a/src/kudu/client/scanner-internal.h b/src/kudu/client/scanner-internal.h
index b7a1ddc..d25c7ed 100644
--- a/src/kudu/client/scanner-internal.h
+++ b/src/kudu/client/scanner-internal.h
@@ -90,6 +90,10 @@ class KuduScanner::Data {
   // Modifies fields in 'next_req_' in preparation for a new request.
   void PrepareRequest(RequestType state);
 
+  // Update 'last_error_' if need be. Should be invoked whenever a
+  // non-fatal (i.e. retriable) scan error is encountered.
+  void UpdateLastError(const Status& error);
+
   bool open_;
   bool data_in_open_;
   bool has_batch_size_bytes_;
@@ -147,6 +151,13 @@ class KuduScanner::Data {
   // actual storage for the batch that is returned.
   KuduScanBatch batch_for_old_api_;
 
+  // The latest error experienced by this scan that provoked a retry. If the
+  // scan times out, this error will be incorporated into the status that is
+  // passed back to the client.
+  //
+  // TODO: This and the overall scan retry logic duplicates much of RpcRetrier.
+  Status last_error_;
+
   DISALLOW_COPY_AND_ASSIGN(Data);
 };
 


Mime
View raw message