Skip to content

ra:process_command/2 times out if used immediately after a server restart #179

@mbj4668

Description

@mbj4668

I have a problem where ra:process_command/2 times out. I have made a small example to reproduce the issue.

I start a single-node ra cluster, stop the single server, restart it, and call ra:process_command. Sometimes (1/10 - 1/20 on my machine) process_command times out. Checking the state shows that the single node is 'follower', rather than 'leader'. Calling trigger_election makes it snap out of this situation.

I run this script to trigger the problem:

#!/bin/sh

while true; do
    rm -rf ra_dir
    mkdir ra_dir
    erl -ra data_dir ra_dir -sname foo -s ra_test run
done

ra_test.erl:

-module(ra_test).

-export([run/0]).

%% RA Machine callbacks
-export([init/1, apply/3]).

run() ->
    {ok, _} = application:ensure_all_started(ra),
    Machine = {module, ?MODULE, #{}},
    Members = [{ra_test, node()}],
    {ok, [SrvId], []} = ra:start_cluster(ra_test, Machine, Members),
    ok = ra:stop_server(SrvId),
    ok = ra:restart_server(SrvId),
    do_test(SrvId),
    erlang:halt().

do_test(SrvId) ->
    case ra:process_command(SrvId, test_cmd) of
        {ok, ok, SrvId} ->
            ok;
        Else ->
            io:format("** ~p UNEXPECTED REPLY: ~p\n", [self(), Else]),
            io:format("** ~p\n", [sys:get_state(ra_test)]),
            receive
                continue ->
                    ok
            end
    end.

%% RA Machine callbacks
init(_) ->
    [].

apply(_Meta, test_cmd, S) ->
    {S, ok, []}.

Most of the time this works fine, but when it times out, it prints:

** <0.9.0> UNEXPECTED REPLY: {timeout,{ra_test,foo@lygna}}
** {follower,
       {state,
           {conf,"{ra_test,foo@lygna}",ra_test,ra_test,100,1000,30000,
               undefined,25,1000000,30000},
           #{aux_state => undefined,
             cluster =>
                 #{{ra_test,foo@lygna} =>
                       #{commit_index_sent => 0,match_index => 0,
                         next_index => 1,query_index => 0}},
             cluster_change_permitted => false,
             cluster_index_term => {0,0},
             commit_index => 0,commit_latency => 0,current_term => 1,
             effective_machine_module => ra_test,
             effective_machine_version => 0,
             id =>
                 {{ra_test,foo@lygna},
                  <<"RA_TESIRB3YEELDG9U">>,"{ra_test,foo@lygna}"},
             last_applied => 0, 
             log =>
                 {ra_log,<<"RA_TESIRB3YEELDG9U">>,"{ra_test,foo@lygna}",
                     "ra_dir/foo@lygna/RA_TESIRB3YEELDG9U",4096,
                     ra_log_snapshot,20,ra_log_wal,0,0,0,
                     {0,0},
                     {ra_snapshot,<<"RA_TESIRB3YEELDG9U">>,ra_log_snapshot,
                         "ra_dir/foo@lygna/RA_TESIRB3YEELDG9U/snapshots",
                         undefined,undefined,undefined},
                     #{},undefined,
                     {ra_log_reader,
                         {cfg,<<"RA_TESIRB3YEELDG9U">>,
                             "ra_dir/foo@lygna/RA_TESIRB3YEELDG9U"},
                         0,[],
                         {ra_flru,1,[],#Fun<ra_log_reader.0.104426489>}},
                     []},
             machine => {machine,ra_test,#{}},
             machine_state => [],machine_version => 0,
             machine_versions => [{0,0}],
             metrics_key => ra_test,pending_consistent_queries => [],
             persisted_last_applied => 0,
             queries_waiting_heartbeats => {[],[]},
             query_index => 0,
             voted_for => {ra_test,foo@lygna}},
           #{},
           [{{<0.137.0>,#Ref<0.456771077.4006871041.7816>},
             {command,normal,{'$usr',test_cmd,await_consensus}}}],
           undefined,undefined,
           {[],[]},
           false,0}}

I'm running Erlang/OTP 22.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions