-
Notifications
You must be signed in to change notification settings - Fork 104
Closed
Labels
Description
I have a problem where ra:process_command/2 times out. I have made a small example to reproduce the issue.
I start a single-node ra cluster, stop the single server, restart it, and call ra:process_command. Sometimes (1/10 - 1/20 on my machine) process_command times out. Checking the state shows that the single node is 'follower', rather than 'leader'. Calling trigger_election makes it snap out of this situation.
I run this script to trigger the problem:
#!/bin/sh
while true; do
rm -rf ra_dir
mkdir ra_dir
erl -ra data_dir ra_dir -sname foo -s ra_test run
done
ra_test.erl:
-module(ra_test).
-export([run/0]).
%% RA Machine callbacks
-export([init/1, apply/3]).
run() ->
{ok, _} = application:ensure_all_started(ra),
Machine = {module, ?MODULE, #{}},
Members = [{ra_test, node()}],
{ok, [SrvId], []} = ra:start_cluster(ra_test, Machine, Members),
ok = ra:stop_server(SrvId),
ok = ra:restart_server(SrvId),
do_test(SrvId),
erlang:halt().
do_test(SrvId) ->
case ra:process_command(SrvId, test_cmd) of
{ok, ok, SrvId} ->
ok;
Else ->
io:format("** ~p UNEXPECTED REPLY: ~p\n", [self(), Else]),
io:format("** ~p\n", [sys:get_state(ra_test)]),
receive
continue ->
ok
end
end.
%% RA Machine callbacks
init(_) ->
[].
apply(_Meta, test_cmd, S) ->
{S, ok, []}.
Most of the time this works fine, but when it times out, it prints:
** <0.9.0> UNEXPECTED REPLY: {timeout,{ra_test,foo@lygna}}
** {follower,
{state,
{conf,"{ra_test,foo@lygna}",ra_test,ra_test,100,1000,30000,
undefined,25,1000000,30000},
#{aux_state => undefined,
cluster =>
#{{ra_test,foo@lygna} =>
#{commit_index_sent => 0,match_index => 0,
next_index => 1,query_index => 0}},
cluster_change_permitted => false,
cluster_index_term => {0,0},
commit_index => 0,commit_latency => 0,current_term => 1,
effective_machine_module => ra_test,
effective_machine_version => 0,
id =>
{{ra_test,foo@lygna},
<<"RA_TESIRB3YEELDG9U">>,"{ra_test,foo@lygna}"},
last_applied => 0,
log =>
{ra_log,<<"RA_TESIRB3YEELDG9U">>,"{ra_test,foo@lygna}",
"ra_dir/foo@lygna/RA_TESIRB3YEELDG9U",4096,
ra_log_snapshot,20,ra_log_wal,0,0,0,
{0,0},
{ra_snapshot,<<"RA_TESIRB3YEELDG9U">>,ra_log_snapshot,
"ra_dir/foo@lygna/RA_TESIRB3YEELDG9U/snapshots",
undefined,undefined,undefined},
#{},undefined,
{ra_log_reader,
{cfg,<<"RA_TESIRB3YEELDG9U">>,
"ra_dir/foo@lygna/RA_TESIRB3YEELDG9U"},
0,[],
{ra_flru,1,[],#Fun<ra_log_reader.0.104426489>}},
[]},
machine => {machine,ra_test,#{}},
machine_state => [],machine_version => 0,
machine_versions => [{0,0}],
metrics_key => ra_test,pending_consistent_queries => [],
persisted_last_applied => 0,
queries_waiting_heartbeats => {[],[]},
query_index => 0,
voted_for => {ra_test,foo@lygna}},
#{},
[{{<0.137.0>,#Ref<0.456771077.4006871041.7816>},
{command,normal,{'$usr',test_cmd,await_consensus}}}],
undefined,undefined,
{[],[]},
false,0}}
I'm running Erlang/OTP 22.