Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 37 additions & 4 deletions deps/rabbit/src/rabbit_db.erl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
is_virgin_node/0, is_virgin_node/1,
dir/0,
ensure_dir_exists/0,
wipe_data_dir/0,
is_init_finished/0,
clear_init_finished/0]).

Expand Down Expand Up @@ -195,8 +196,32 @@ force_load_on_next_boot_using_mnesia() ->
rabbit_mnesia:force_load_next_boot().

post_reset() ->
%% We assert all Ra systems are stopped because their files are about to
%% be removed.
lists:foreach(
fun(RaSystem) ->
case rabbit_ra_systems:is_running(RaSystem) of
false ->
ok;
true ->
Reason = rabbit_misc:format(
"Ra system '~s' is still running during "
"reset",
[RaSystem]),
?LOG_ERROR(
"DB: ~ts",
[Reason],
#{domain => ?RMQLOG_DOMAIN_DB}),
throw({error, Reason})
end
end, rabbit_ra_systems:all_ra_systems()),

%% We reset the state of feature flags, both in memory and on disk. The
%% state recorded on disk would be deleted with the wipe below anyway.
rabbit_feature_flags:reset(),

wipe_data_dir(),

%% The cluster status files that RabbitMQ uses when Mnesia is the database
%% are initially created from rabbit_prelaunch_cluster. However, it will
%% only be done once the `rabbit` app is restarted. Meanwhile, they are
Expand All @@ -209,6 +234,17 @@ post_reset() ->

ok.

%% Removes everything found directly inside the data directory returned by
%% `dir/0'.
%%
%% The matched entries are listed in a debug log message, then handed to
%% `rabbit_file:recursive_delete/1'. Only the entries inside the directory are
%% passed for deletion, not the directory path itself.
%%
%% Returns `ok'. Crashes with a `badmatch' if
%% `rabbit_file:recursive_delete/1' returns anything other than `ok'.
wipe_data_dir() ->
    DataDir = dir(),
    %% The data directory is given as the base directory of the wildcard
    %% instead of being embedded in the pattern. Otherwise, characters such as
    %% `[', `{', `*' or `?' in the directory path would be interpreted as
    %% wildcard syntax and the wrong set of files (or none) would be matched.
    EntryNames = filelib:wildcard("*", DataDir),
    FilesToRemove = lists:sort(
                      [filename:join(DataDir, EntryName)
                       || EntryName <- EntryNames]),
    %% `~tp' is used so that file names with non-Latin-1 characters are
    %% printed as strings rather than lists of integers.
    ?LOG_DEBUG(
       "DB: wipe files in data directory `~ts`:~n~tp",
       [DataDir, FilesToRemove],
       #{domain => ?RMQLOG_DOMAIN_DB}),
    ok = rabbit_file:recursive_delete(FilesToRemove),
    ok.

%% -------------------------------------------------------------------
%% is_virgin_node().
%% -------------------------------------------------------------------
Expand All @@ -231,10 +267,7 @@ is_virgin_node_using_mnesia() ->
rabbit_mnesia:is_virgin_node().

is_virgin_node_using_khepri() ->
case rabbit_khepri:is_empty() of
{error, _} -> true;
IsEmpty -> IsEmpty
end.
rabbit_khepri:is_virgin_node().

-spec is_virgin_node(Node) -> IsVirgin | undefined when
Node :: node(),
Expand Down
23 changes: 2 additions & 21 deletions deps/rabbit/src/rabbit_db_cluster.erl
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ join(RemoteNode, NodeType)
%% database because we might change it during the join.
RestartMnesia = rabbit_mnesia:is_running(),
RestartFFCtl = rabbit_ff_controller:is_running(),
RestartRaSystems = rabbit_ra_systems:are_running(),
RestartRabbit = rabbit:is_running(),
case RestartRabbit of
true ->
Expand All @@ -107,10 +106,7 @@ join(RemoteNode, NodeType)
%% Therefore, there are files in the data directory. They
%% will go away with the reset and we will need to restart
%% Ra systems afterwards.
case RestartRaSystems of
true -> ok = rabbit_ra_systems:ensure_stopped();
false -> ok
end,
ok = rabbit_ra_systems:ensure_stopped(),

case RestartFFCtl of
true ->
Expand All @@ -135,7 +131,7 @@ join(RemoteNode, NodeType)
%% `rabbit_ff_registry_wrapper'.
rabbit_ff_registry_factory:acquire_state_change_lock(),
try
ok = rabbit_db:reset(),
rabbit_db:reset(),
rabbit_feature_flags:copy_feature_states_after_reset(
RemoteNode)
after
Expand All @@ -153,21 +149,6 @@ join(RemoteNode, NodeType)

ok = rabbit_node_monitor:notify_left_cluster(node()),

%% Now that the files are all gone after the reset above, restart
%% the Ra systems. They will recreate their folder in the process.
case RestartRabbit of
true ->
ok;
false ->
case RestartRaSystems of
true ->
ok = rabbit_ra_systems:ensure_started(),
ok = rabbit_khepri:setup();
false ->
ok
end
end,

?LOG_INFO(
"DB: joining cluster using remote nodes:~n~tp", [ClusterNodes],
#{domain => ?RMQLOG_DOMAIN_DB}),
Expand Down
1 change: 1 addition & 0 deletions deps/rabbit/src/rabbit_feature_flags.erl
Original file line number Diff line number Diff line change
Expand Up @@ -1331,6 +1331,7 @@ copy_feature_states_after_reset(RemoteNode) ->
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
case do_write_enabled_feature_flags_list(EnabledFeatureNames) of
ok ->
ok = reset_registry(),
ok;
{error, Reason} ->
File = enabled_feature_flags_list_file(),
Expand Down
40 changes: 34 additions & 6 deletions deps/rabbit/src/rabbit_khepri.erl
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@
setup/1,
init/1,
reset/0,
is_virgin_node/0,

dir/0,
get_ra_cluster_name/0,
Expand Down Expand Up @@ -297,10 +298,16 @@ setup(_Context) ->
exit(Error)
end.

%% Tells whether the Ra system named by the `?RA_SYSTEM' macro is running,
%% as reported by `rabbit_ra_systems:is_running/1'.
is_ra_system_running() ->
    IsRunning = rabbit_ra_systems:is_running(?RA_SYSTEM),
    IsRunning.

%% Makes sure the `khepri' application (and the applications it depends on)
%% is started, then asks `rabbit_ra_systems' to ensure the `?RA_SYSTEM' Ra
%% system is started.
%%
%% Returns `ok'. Crashes with a `badmatch' if either step does not succeed.
ensure_ra_system_started() ->
    {ok, _StartedApps} = application:ensure_all_started(khepri),
    ok = rabbit_ra_systems:ensure_ra_system_started(?RA_SYSTEM),
    ok.

%% Asks `rabbit_ra_systems' to ensure the `?RA_SYSTEM' Ra system is stopped.
%%
%% Returns `ok'. Crashes with a `badmatch' if the call does not return `ok'.
ensure_ra_system_stopped() ->
    ok = rabbit_ra_systems:ensure_ra_system_stopped(?RA_SYSTEM),
    ok.

retry_timeout() ->
case application:get_env(rabbit, khepri_leader_wait_retry_timeout) of
{ok, T} when is_integer(T) andalso T >= 0 -> T;
Expand Down Expand Up @@ -375,16 +382,42 @@ reset() ->
false ->
%% Rabbit should be stopped, but Khepri needs to be running.
%% Restart it.
RaSystemRunning = is_ra_system_running(),
ok = setup(),
ok = khepri_cluster:reset(?RA_CLUSTER_NAME),
ok = khepri:stop(?RA_CLUSTER_NAME),
RaSystemRunning orelse ensure_ra_system_stopped(),

_ = file:delete(rabbit_guid:filename()),
ok;
true ->
throw({error, rabbitmq_unexpectedly_running})
end.

%% Tells whether this node is a "virgin" node from Khepri's point of view,
%% i.e. whether `is_empty/0' returns `true'. Any other return value of
%% `is_empty/0' (including an error tuple) yields `false'.
%%
%% `is_empty/0' needs the Ra system and the Khepri store to be running. If
%% either of them is not running when this function is called, it is started
%% for the duration of the check and stopped again afterwards, so that the
%% node is left in the state it was found in.
%%
%% The cleanup is performed in `after' clauses: if `setup/0' or `is_empty/0'
%% raises an exception, whatever was started here is still stopped before the
%% exception propagates, instead of being left running.
is_virgin_node() ->
    IsSystemRunning = is_ra_system_running(),
    IsStoreRunning = khepri_cluster:is_store_running(?STORE_ID),
    case IsSystemRunning of
        true  -> ok;
        false -> ok = ensure_ra_system_started()
    end,
    try
        case IsStoreRunning of
            true  -> ok;
            false -> ok = setup()
        end,
        try
            is_empty() =:= true
        after
            %% Only stop the store if this function started it.
            case IsStoreRunning of
                true  -> ok;
                false -> ok = khepri:stop(?RA_CLUSTER_NAME)
            end
        end
    after
        %% Only stop the Ra system if this function started it.
        case IsSystemRunning of
            true  -> ok;
            false -> ok = ensure_ra_system_stopped()
        end
    end.

-spec dir() -> Dir when
Dir :: file:filename_all().
%% @doc Returns the Khepri store directory.
Expand Down Expand Up @@ -661,13 +694,8 @@ remove_reachable_member(NodeToRemove) ->
[NodeToRemove, ?RA_CLUSTER_NAME],
#{domain => ?RMQLOG_DOMAIN_GLOBAL}),

%% We need the Khepri store to run on the node to remove, to be
%% able to reset it.
ok = rabbit_misc:rpc_call(
NodeToRemove, ?MODULE, setup, []),

Ret = rabbit_misc:rpc_call(
NodeToRemove, khepri_cluster, reset, [?RA_CLUSTER_NAME]),
NodeToRemove, rabbit_db, reset, []),
case Ret of
ok ->
rabbit_amqqueue:forget_all(NodeToRemove),
Expand Down
7 changes: 3 additions & 4 deletions deps/rabbit/src/rabbit_mnesia.erl
Original file line number Diff line number Diff line change
Expand Up @@ -257,9 +257,8 @@ wipe() ->
%% Erlang system with nodes while not being in an Mnesia cluster
%% with them. We don't handle that well.
[erlang:disconnect_node(N) || N <- cluster_nodes(all)],
%% remove persisted messages and any other garbage we find
ok = rabbit_file:recursive_delete(filelib:wildcard(dir() ++ "/*")),
ok = rabbit_node_monitor:reset_cluster_status(),
%% Historically performed by this function, the data dir is wiped by
%% `rabbit_db' now. Likewise for the cluster status reset.
ok.

-spec change_cluster_node_type(rabbit_db_cluster:node_type()) -> 'ok'.
Expand All @@ -279,7 +278,7 @@ change_cluster_node_type(Type) ->
[] -> e(no_online_cluster_nodes);
[Node0|_] -> Node0
end,
ok = reset(),
ok = rabbit_db:reset(),
ok = join_cluster(Node, Type).

%% We proceed like this: try to remove the node locally. If the node
Expand Down
15 changes: 6 additions & 9 deletions deps/rabbit/src/rabbit_ra_systems.erl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
-export([setup/0,
setup/1,
all_ra_systems/0,
are_running/0,
is_running/1,
ensure_ra_system_started/1,
ensure_ra_system_stopped/1,
ensure_started/0,
Expand Down Expand Up @@ -46,18 +46,15 @@ all_ra_systems() ->
[coordination,
quorum_queues].

-spec are_running() -> AreRunning when
AreRunning :: boolean().
-spec is_running(RaSystem) -> IsRunning when
RaSystem :: ra_system_name(),
IsRunning :: boolean().

are_running() ->
is_running(RaSystem) ->
try
%% FIXME: We hard-code the name of an internal Ra process here.
Children = supervisor:which_children(ra_systems_sup),
lists:all(
fun(RaSystem) ->
is_ra_system_running(Children, RaSystem)
end,
all_ra_systems())
is_ra_system_running(Children, RaSystem)
catch
exit:{noproc, _} ->
false
Expand Down
Loading