flink-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kl0u <...@git.apache.org>
Subject [GitHub] flink pull request #5750: [FLINK-8973] [E2E] HA end-to-end test with StateMa...
Date Fri, 23 Mar 2018 15:10:18 GMT
Github user kl0u commented on a diff in the pull request:

    https://github.com/apache/flink/pull/5750#discussion_r176765903
  
    --- Diff: flink-end-to-end-tests/test-scripts/test_ha.sh ---
    @@ -0,0 +1,108 @@
    +#!/usr/bin/env bash
    +
    +################################################################################
    +# Licensed to the Apache Software Foundation (ASF) under one
    +# or more contributor license agreements.  See the NOTICE file
    +# distributed with this work for additional information
    +# regarding copyright ownership.  The ASF licenses this file
    +# to you under the Apache License, Version 2.0 (the
    +# "License"); you may not use this file except in compliance
    +# with the License.  You may obtain a copy of the License at
    +#
    +#     http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +################################################################################
    +
    +source "$(dirname "$0")"/common.sh
    +
    +#FLINK_DIR=/Users/kkloudas/repos/dataartisans/flink/build-target flink-end-to-end-tests/test-scripts/test_ha.sh
    +TEST_PROGRAM_JAR=$TEST_INFRA_DIR/../../flink-examples/flink-examples-streaming/target/StateMachineExample.jar\
--error-rate\ 0.0\ --sleep\ 2
    +
    +stop_cluster_and_watchdog() {
    +    kill ${watchdogPid} 2> /dev/null
    +    wait ${watchdogPid} 2> /dev/null
    +
    +    stop_ha_cluster
    +}
    +
    +verify_logs() {
    +    expectedRetries=$1
    +
    +    # verify that we have no alerts
    +    if ! [ `cat ${output} | wc -l` -eq 0 ]; then
    +        echo "FAILURE: Alerts found at the StateMachineExample with 0.0 error rate."
    +        PASS=""
    +    fi
    +
    +    # checks that all apart from the first JM recover the failes jobgraph.
    +    if ! [ `grep -r --include '*standalonesession*.log' Recovered SubmittedJobGraph "${FLINK_DIR}/log/"
| cut -d ":" -f 1 | uniq | wc -l` -eq ${expectedRetries} ]; then
    +        echo "FAILURE: A JM did not take over."
    +        PASS=""
    +    fi
    +
    +    # search the logs for JMs that log completed checkpoints
    +    if ! [ `grep -r --include '*standalonesession*.log' Completed checkpoint "${FLINK_DIR}/log/"
| cut -d ":" -f 1 | uniq | wc -l` -eq $((expectedRetries + 1)) ]; then
    +        echo "FAILURE: A JM did not execute the job."
    +        PASS=""
    +    fi
    +}
    +
    +run_ha_test() {
    +    parallelism=$1
    +    backend=$2
    +    async=$3
    +    incremental=$4
    +    maxAttempts=$5
    +    rstrtInterval=$6
    +    output=$7
    +
    +    jmKillAndRetries=2
    +    checkpointDir="${TEST_DATA_DIR}/checkpoints/"
    +
    +    # start the cluster on HA mode and
    +    # verify that all JMs are running
    +    start_ha_cluster
    +
    +    echo "Running on HA mode: parallelism=${parallelism}, backend=${backend}, asyncSnapshots=${async},
and incremSnapshots=${incremental}."
    +
    +    # submit a job in detached mode and let it run
    +    $FLINK_DIR/bin/flink run -d -p ${parallelism} \
    +     $TEST_PROGRAM_JAR \
    +        --stateBackend ${backend} \
    +        --checkpointDir "file://${checkpointDir}" \
    +        --asyncCheckpoints ${async} \
    +        --incrementalCheckpoints ${incremental} \
    +        --restartAttempts ${maxAttempts} \
    +        --restartDelay ${rstrtInterval} \
    +        --output ${output} > /dev/null
    +
    +    # start the watchdog that keeps the number of JMs stable
    +    jm_watchdog 1 "8081" &
    +    watchdogPid=$!
    +
    +    # let the job run for a while to take some checkpoints
    +    sleep 50
    +
    +    for (( c=0; c<${jmKillAndRetries}; c++ )); do
    +        # kill the JM and wait for watchdog to
    +        # create a new JM which will take over
    +        kill_jm 0
    --- End diff --
    
    There is only one JM running at any point in time.


---

Mime
View raw message