helix-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From hu...@apache.org
Subject [helix] branch master updated: Stabilizing 4 flaky tests (#981)
Date Fri, 01 May 2020 00:48:31 GMT
This is an automated email from the ASF dual-hosted git repository.

hulee pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/helix.git


The following commit(s) were added to refs/heads/master by this push:
     new 5a10292  Stabilizing 4 flaky tests (#981)
5a10292 is described below

commit 5a10292197b7233e6d37d0c669704e8d40bd7d6d
Author: Ali Reza Zamani Zadeh Najari <anajari@linkedin.com>
AuthorDate: Thu Apr 30 17:48:20 2020 -0700

    Stabilizing 4 flaky tests (#981)
    
    Four tests have been stabilized in this commit.
    
    These tests are:
    1-TestJobFailure
    2-TestRebalanceRunningTask
    3-TestTaskRebalancerStopResume
    4-TestTaskSchedulingTwoCurrentStates
    
    TestJobFailure was unstable because it reads the ExternalView of a resource, and if the
controller has not populated that ExternalView yet, the test hits a NullPointerException. The test
now waits until the ExternalView exists before reading it.
    
    TestRebalanceRunningTask was unstable. In this PR, we first make sure that masters exist
on two different nodes (i.e., a master has been switched to the new instance), and only then
check the assigned participants.
    
    TestTaskRebalancerStopResume was unstable because of its Thread.sleep usage. Instead of
stopping the workflow after a fixed delay, we first make sure that the workflow and the job are
IN_PROGRESS, and then stop the workflow.
    
    TestTaskSchedulingTwoCurrentStates has been stabilized by making sure that the master has
been switched to the new instance after modifying the IdealState (IS). After that, we make sure
that the task is assigned to the correct instance, that it is not switched to a new instance, and
that cancel is not called incorrectly.
---
 .../helix/integration/task/TestJobFailure.java     | 11 +++++--
 .../integration/task/TestRebalanceRunningTask.java | 34 +++++++++++++++++++++-
 .../task/TestTaskRebalancerStopResume.java         |  3 +-
 .../task/TestTaskSchedulingTwoCurrentStates.java   | 16 ++++++++++
 4 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/helix-core/src/test/java/org/apache/helix/integration/task/TestJobFailure.java
b/helix-core/src/test/java/org/apache/helix/integration/task/TestJobFailure.java
index 1bf4221..b06cebb 100644
--- a/helix-core/src/test/java/org/apache/helix/integration/task/TestJobFailure.java
+++ b/helix-core/src/test/java/org/apache/helix/integration/task/TestJobFailure.java
@@ -74,7 +74,7 @@ public final class TestJobFailure extends TaskSynchronizedTestBase {
   @Test(dataProvider = "testJobFailureInput")
   public void testNormalJobFailure(String comment, List<String> taskStates,
       List<String> expectedTaskEndingStates, String expectedJobEndingStates,
-      String expectedWorkflowEndingStates) throws InterruptedException {
+      String expectedWorkflowEndingStates) throws Exception {
     final String JOB_NAME = "test_job";
     final String WORKFLOW_NAME = TestHelper.getTestMethodName() + testNum++;
     System.out.println("Test case comment: " + comment);
@@ -118,8 +118,15 @@ public final class TestJobFailure extends TaskSynchronizedTestBase {
   }
 
   private Map<String, Map<String, String>> createPartitionConfig(List<String>
taskStates,
-      List<String> expectedTaskEndingStates) {
+      List<String> expectedTaskEndingStates) throws Exception {
     Map<String, Map<String, String>> targetPartitionConfigs = new HashMap<>();
+    // Make sure external view has been created for the resource
+    Assert.assertTrue(TestHelper.verify(() -> {
+      ExternalView externalView =
+          _manager.getClusterManagmentTool().getResourceExternalView(CLUSTER_NAME, DB_NAME);
+      return externalView != null;
+    }, TestHelper.WAIT_DURATION));
+
     ExternalView externalView =
         _manager.getClusterManagmentTool().getResourceExternalView(CLUSTER_NAME, DB_NAME);
     Set<String> partitionSet = externalView.getPartitionSet();
diff --git a/helix-core/src/test/java/org/apache/helix/integration/task/TestRebalanceRunningTask.java
b/helix-core/src/test/java/org/apache/helix/integration/task/TestRebalanceRunningTask.java
index 4473cce..e659797 100644
--- a/helix-core/src/test/java/org/apache/helix/integration/task/TestRebalanceRunningTask.java
+++ b/helix-core/src/test/java/org/apache/helix/integration/task/TestRebalanceRunningTask.java
@@ -20,6 +20,7 @@ package org.apache.helix.integration.task;
  */
 
 import java.util.HashSet;
+import java.util.Map;
 import java.util.Set;
 
 import com.google.common.collect.ImmutableMap;
@@ -27,6 +28,7 @@ import com.google.common.collect.Sets;
 import org.apache.helix.TestHelper;
 import org.apache.helix.integration.manager.ClusterControllerManager;
 import org.apache.helix.integration.manager.MockParticipantManager;
+import org.apache.helix.model.ExternalView;
 import org.apache.helix.model.MasterSlaveSMD;
 import org.apache.helix.task.JobConfig;
 import org.apache.helix.task.JobContext;
@@ -239,7 +241,7 @@ public final class TestRebalanceRunningTask extends TaskSynchronizedTestBase
{
    * Story: new node added
    */
   @Test
-  public void testFixedTargetTaskAndDisabledRebalanceAndNodeAdded() throws InterruptedException
{
+  public void testFixedTargetTaskAndDisabledRebalanceAndNodeAdded() throws Exception {
     WORKFLOW = TestHelper.getTestMethodName();
     JobConfig.Builder jobBuilder =
         new JobConfig.Builder().setWorkflow(WORKFLOW).setTargetResource(DATABASE)
@@ -261,6 +263,36 @@ public final class TestRebalanceRunningTask extends TaskSynchronizedTestBase
{
         new BestPossibleExternalViewVerifier.Builder(CLUSTER_NAME).setZkClient(_gZkClient)
             .setResources(Sets.newHashSet(DATABASE)).build();
     Assert.assertTrue(clusterVerifier.verify(10 * 1000));
+
+    // Wait until master is switched to new instance and two masters exist on two different
instances
+    boolean isMasterOnTwoDifferentNodes = TestHelper.verify(() -> {
+      Set<String> masterInstances = new HashSet<>();
+      ExternalView externalView =
+          _gSetupTool.getClusterManagementTool().getResourceExternalView(CLUSTER_NAME, DATABASE);
+      if (externalView == null) {
+        return false;
+      }
+
+      Map<String, String> stateMap0 = externalView.getStateMap(DATABASE + "_0");
+      Map<String, String> stateMap1 = externalView.getStateMap(DATABASE + "_1");
+      if (stateMap0 == null || stateMap1 == null) {
+        return false;
+      }
+
+      for (Map.Entry<String, String> entry : stateMap0.entrySet()) {
+        if (entry.getValue().equals("MASTER")) {
+          masterInstances.add(entry.getKey());
+        }
+      }
+      for (Map.Entry<String, String> entry : stateMap1.entrySet()) {
+        if (entry.getValue().equals("MASTER")) {
+          masterInstances.add(entry.getKey());
+        }
+      }
+      return masterInstances.size() == 2;
+    }, TestHelper.WAIT_DURATION);
+    Assert.assertTrue(isMasterOnTwoDifferentNodes);
+
     // Running tasks are also rebalanced, even though RebalanceRunningTask is disabled
     Assert.assertTrue(checkTasksOnDifferentInstances());
   }
diff --git a/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskRebalancerStopResume.java
b/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskRebalancerStopResume.java
index 0a10a72..31f6d43 100644
--- a/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskRebalancerStopResume.java
+++ b/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskRebalancerStopResume.java
@@ -503,7 +503,8 @@ public class TestTaskRebalancerStopResume extends TaskTestBase {
     builder.addJob(job2Name, job2);
 
     _driver.start(builder.build());
-    Thread.sleep(1000);
+    _driver.pollForWorkflowState(workflowName, TaskState.IN_PROGRESS);
+    _driver.pollForJobState(workflowName, TaskUtil.getNamespacedJobName(workflowName, job1Name),
TaskState.IN_PROGRESS);
     _driver.stop(workflowName);
     _driver.pollForWorkflowState(workflowName, TaskState.STOPPING);
 
diff --git a/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskSchedulingTwoCurrentStates.java
b/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskSchedulingTwoCurrentStates.java
index 5b63b44..85996e7 100644
--- a/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskSchedulingTwoCurrentStates.java
+++ b/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskSchedulingTwoCurrentStates.java
@@ -36,6 +36,7 @@ import org.apache.helix.integration.manager.MockParticipantManager;
 import org.apache.helix.manager.zk.ZKHelixDataAccessor;
 import org.apache.helix.model.ClusterConfig;
 import org.apache.helix.model.CurrentState;
+import org.apache.helix.model.ExternalView;
 import org.apache.helix.model.IdealState;
 import org.apache.helix.model.MasterSlaveSMD;
 import org.apache.helix.model.Partition;
@@ -138,6 +139,21 @@ public class TestTaskSchedulingTwoCurrentStates extends TaskTestBase
{
     JobQueue.Builder jobQueue = TaskTestUtil.buildJobQueue(jobQueueName);
     jobQueue.enqueueJob("JOB0", jobBuilder0);
 
+    // Make sure master has been correctly switched to Participant1
+    boolean isMasterSwitchedToCorrectInstance = TestHelper.verify(() -> {
+      ExternalView externalView =
+          _gSetupTool.getClusterManagementTool().getResourceExternalView(CLUSTER_NAME, DATABASE);
+      if (externalView == null) {
+        return false;
+      }
+      Map<String, String> stateMap = externalView.getStateMap(DATABASE + "_0");
+      if (stateMap == null) {
+        return false;
+      }
+      return "MASTER".equals(stateMap.get(PARTICIPANT_PREFIX + "_" + (_startPort + 1)));
+    }, TestHelper.WAIT_DURATION);
+    Assert.assertTrue(isMasterSwitchedToCorrectInstance);
+
     _driver.start(jobQueue.build());
 
     String namespacedJobName = TaskUtil.getNamespacedJobName(jobQueueName, "JOB0");


Mime
View raw message