Skip to content

Commit a87f5b1

Browse files
committed
Handle races with quorum_queue feature flag migration fun
In a few places, the migration of the `rabbit_queue` and `rabbit_durable_queue` Mnesia tables might conflict with accesses to those tables. [#159298729]
1 parent 93168ae commit a87f5b1

File tree

7 files changed

+134
-12
lines changed

7 files changed

+134
-12
lines changed

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,10 @@ ifdef CREDIT_FLOW_TRACING
231231
RMQ_ERLC_OPTS += -DCREDIT_FLOW_TRACING=true
232232
endif
233233

234+
ifdef DEBUG_FF
235+
RMQ_ERLC_OPTS += -DDEBUG_QUORUM_QUEUE_FF=true
236+
endif
237+
234238
ifndef USE_PROPER_QC
235239
# PropEr needs to be installed for property checking
236240
# http://proper.softlab.ntua.gr/

include/amqqueue.hrl

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,3 +111,26 @@
111111
?amqqueue_v2_vhost(Q) =:= VHost) orelse
112112
(?is_amqqueue_v1(Q) andalso
113113
?amqqueue_v1_vhost(Q) =:= VHost))).
114+
115+
-ifdef(DEBUG_QUORUM_QUEUE_FF).
116+
-define(enable_quorum_queue_if_debug,
117+
begin
118+
rabbit_log:info(
119+
"---- ENABLING quorum_queue as part of "
120+
"?try_mnesia_tx_or_upgrade_amqqueue_and_retry() ----"),
121+
ok = rabbit_feature_flags:enable(quorum_queue)
122+
end).
123+
-else.
124+
-define(enable_quorum_queue_if_debug, noop).
125+
-endif.
126+
127+
-define(try_mnesia_tx_or_upgrade_amqqueue_and_retry(Expr1, Expr2),
128+
try
129+
?enable_quorum_queue_if_debug,
130+
Expr1
131+
catch
132+
throw:{error, {bad_type, T}} when ?is_amqqueue(T) ->
133+
Expr2;
134+
throw:{aborted, {bad_type, T}} when ?is_amqqueue(T) ->
135+
Expr2
136+
end).

src/rabbit_amqqueue.erl

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@
3030
-export([list/0, list/1, info_keys/0, info/1, info/2, info_all/1, info_all/2,
3131
emit_info_all/5, list_local/1, info_local/1,
3232
emit_info_local/4, emit_info_down/4]).
33-
-export([list_down/1, count/1, list_names/0, list_names/1, list_local_names/0]).
33+
-export([list_down/1, count/1, list_names/0, list_names/1, list_local_names/0,
34+
list_with_possible_retry/1]).
3435
-export([list_by_type/1]).
3536
-export([notify_policy_changed/1]).
3637
-export([consumers/1, consumers_all/1, emit_consumers_all/4, consumer_info_keys/0]).
@@ -297,6 +298,11 @@ start(Qs) ->
297298
ok.
298299

299300
mark_local_durable_queues_stopped(VHost) ->
301+
?try_mnesia_tx_or_upgrade_amqqueue_and_retry(
302+
do_mark_local_durable_queues_stopped(VHost),
303+
do_mark_local_durable_queues_stopped(VHost)).
304+
305+
do_mark_local_durable_queues_stopped(VHost) ->
300306
Qs = find_local_durable_classic_queues(VHost),
301307
rabbit_misc:execute_mnesia_transaction(
302308
fun() ->
@@ -426,13 +432,21 @@ get_queue_type(Args) ->
426432
erlang:binary_to_existing_atom(V, utf8)
427433
end.
428434

429-
internal_declare(Q, true) ->
435+
internal_declare(Q, Recover) ->
436+
?try_mnesia_tx_or_upgrade_amqqueue_and_retry(
437+
do_internal_declare(Q, Recover),
438+
begin
439+
Q1 = amqqueue:upgrade(Q),
440+
do_internal_declare(Q1, Recover)
441+
end).
442+
443+
do_internal_declare(Q, true) ->
430444
rabbit_misc:execute_mnesia_tx_with_tail(
431445
fun () ->
432446
ok = store_queue(amqqueue:set_state(Q, live)),
433447
rabbit_misc:const({created, Q})
434448
end);
435-
internal_declare(Q, false) ->
449+
do_internal_declare(Q, false) ->
436450
QueueName = amqqueue:get_name(Q),
437451
rabbit_misc:execute_mnesia_tx_with_tail(
438452
fun () ->
@@ -468,6 +482,14 @@ update(Name, Fun) ->
468482
%% only really used for quorum queues to ensure the rabbit_queue record
469483
%% is initialised
470484
ensure_rabbit_queue_record_is_initialized(Q) ->
485+
?try_mnesia_tx_or_upgrade_amqqueue_and_retry(
486+
do_ensure_rabbit_queue_record_is_initialized(Q),
487+
begin
488+
Q1 = amqqueue:upgrade(Q),
489+
do_ensure_rabbit_queue_record_is_initialized(Q1)
490+
end).
491+
492+
do_ensure_rabbit_queue_record_is_initialized(Q) ->
471493
rabbit_misc:execute_mnesia_tx_with_tail(
472494
fun () ->
473495
ok = store_queue(Q),
@@ -794,7 +816,11 @@ check_queue_type({Type, _}, _Args) ->
794816
{error, {unacceptable_type, Type}}.
795817

796818

797-
list() -> mnesia:dirty_match_object(rabbit_queue, amqqueue:pattern_match_all()).
819+
list() ->
820+
list_with_possible_retry(fun do_list/0).
821+
822+
do_list() ->
823+
mnesia:dirty_match_object(rabbit_queue, amqqueue:pattern_match_all()).
798824

799825
list_names() -> mnesia:dirty_all_keys(rabbit_queue).
800826

@@ -828,9 +854,12 @@ is_local_to_node({_, Leader} = QPid, Node) when ?IS_QUORUM(QPid) ->
828854
list(VHostPath) ->
829855
list(VHostPath, rabbit_queue).
830856

857+
list(VHostPath, TableName) ->
858+
list_with_possible_retry(fun() -> do_list(VHostPath, TableName) end).
859+
831860
%% Not dirty_match_object since that would not be transactional when used in a
832861
%% tx context
833-
list(VHostPath, TableName) ->
862+
do_list(VHostPath, TableName) ->
834863
mnesia:async_dirty(
835864
fun () ->
836865
mnesia:match_object(
@@ -839,6 +868,38 @@ list(VHostPath, TableName) ->
839868
read)
840869
end).
841870

871+
list_with_possible_retry(Fun) ->
872+
%% amqqueue migration:
873+
%% The `rabbit_queue` or `rabbit_durable_queue` tables
874+
%% might be migrated between the time we query the pattern
875+
%% (with the `amqqueue` module) and the time we call
876+
%% `mnesia:dirty_match_object()`. This would lead to an empty list
877+
%% (no object matching the now incorrect pattern), not a Mnesia
878+
%% error.
879+
%%
880+
%% So if the result is an empty list and the version of the
881+
%% `amqqueue` record changed in between, we retry the operation.
882+
%%
883+
%% However, we don't do this if inside a Mnesia transaction: we
884+
%% could end up with a live lock between this started transaction
885+
%% and the Mnesia table migration which is blocked (but the
886+
%% rabbit_feature_flags lock is held).
887+
AmqqueueRecordVersion = amqqueue:record_version_to_use(),
888+
case Fun() of
889+
[] ->
890+
case mnesia:is_transaction() of
891+
true ->
892+
[];
893+
false ->
894+
case amqqueue:record_version_to_use() of
895+
AmqqueueRecordVersion -> [];
896+
_ -> Fun()
897+
end
898+
end;
899+
Ret ->
900+
Ret
901+
end.
902+
842903
list_down(VHostPath) ->
843904
case rabbit_vhost:exists(VHostPath) of
844905
false -> [];
@@ -859,13 +920,21 @@ count(VHost) ->
859920
%% won't work here because with master migration of mirrored queues
860921
%% the "ownership" of queues by nodes becomes a non-trivial problem
861922
%% that requires a proper consensus algorithm.
862-
length(mnesia:dirty_index_read(rabbit_queue, VHost, amqqueue:field_vhost()))
923+
length(list_for_count(VHost))
863924
catch _:Err ->
864925
rabbit_log:error("Failed to fetch number of queues in vhost ~p:~n~p~n",
865926
[VHost, Err]),
866927
0
867928
end.
868929

930+
list_for_count(VHost) ->
931+
list_with_possible_retry(
932+
fun() ->
933+
mnesia:dirty_index_read(rabbit_queue,
934+
VHost,
935+
amqqueue:field_vhost())
936+
end).
937+
869938
info_keys() -> rabbit_amqqueue_process:info_keys().
870939

871940
map(Qs, F) -> rabbit_misc:filter_exit_map(F, Qs).

src/rabbit_amqqueue_process.erl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,9 @@ terminate(shutdown = R, State = #q{backing_queue = BQ, q = Q0}) ->
294294
fun() ->
295295
[Q] = mnesia:read({rabbit_queue, QName}),
296296
Q2 = amqqueue:set_state(Q, stopped),
297+
%% amqqueue migration:
298+
%% The amqqueue was read from this transaction, no need
299+
%% to handle migration.
297300
rabbit_amqqueue:store_queue(Q2)
298301
end),
299302
BQ:terminate(R, BQS)
@@ -320,7 +323,12 @@ terminate(_Reason, State = #q{q = Q}) ->
320323
Q2 = amqqueue:set_state(Q, crashed),
321324
rabbit_misc:execute_mnesia_transaction(
322325
fun() ->
323-
rabbit_amqqueue:store_queue(Q2)
326+
?try_mnesia_tx_or_upgrade_amqqueue_and_retry(
327+
rabbit_amqqueue:store_queue(Q2),
328+
begin
329+
Q3 = amqqueue:upgrade(Q2),
330+
rabbit_amqqueue:store_queue(Q3)
331+
end)
324332
end),
325333
BQS
326334
end, State).

src/rabbit_mirror_queue_master.erl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,9 @@ init_with_existing_bq(Q0, BQ, BQS) when ?is_amqqueue(Q0) ->
115115
GMPids1 = [{GM, Self} | GMPids0],
116116
Q2 = amqqueue:set_gm_pids(Q1, GMPids1),
117117
Q3 = amqqueue:set_state(Q2, live),
118+
%% amqqueue migration:
119+
%% The amqqueue was read from this transaction, no
120+
%% need to handle migration.
118121
ok = rabbit_amqqueue:store_queue(Q3)
119122
end,
120123
ok = rabbit_misc:execute_mnesia_transaction(Fun),

src/rabbit_mirror_queue_misc.erl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,9 @@ store_updated_slaves(Q0) when ?is_amqqueue(Q0) ->
300300
RS1 = update_recoverable(SPids, RS0),
301301
Q2 = amqqueue:set_recoverable_slaves(Q1, RS1),
302302
Q3 = amqqueue:set_state(Q2, live),
303+
%% amqqueue migration:
304+
%% The amqqueue was read from this transaction, no need to handle
305+
%% migration.
303306
ok = rabbit_amqqueue:store_queue(Q3),
304307
%% Wake it up so that we emit a stats event
305308
rabbit_amqqueue:notify_policy_changed(Q3),

src/rabbit_policy.erl

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,11 @@ recover() ->
184184
%% variants.
185185
recover0() ->
186186
Xs = mnesia:dirty_match_object(rabbit_durable_exchange, #exchange{_ = '_'}),
187-
Qs = mnesia:dirty_match_object(rabbit_durable_queue, amqqueue:pattern_match_all()),
187+
Qs = rabbit_amqqueue:list_with_possible_retry(
188+
fun() ->
189+
mnesia:dirty_match_object(
190+
rabbit_durable_queue, amqqueue:pattern_match_all())
191+
end),
188192
Policies = list(),
189193
OpPolicies = list_op(),
190194
[rabbit_misc:execute_mnesia_transaction(
@@ -203,10 +207,18 @@ recover0() ->
203207
OpPolicy1 = match(QName, OpPolicies),
204208
Q2 = amqqueue:set_operator_policy(Q1, OpPolicy1),
205209
Q3 = rabbit_queue_decorator:set(Q2),
206-
F = fun () ->
207-
mnesia:write(rabbit_durable_queue, Q3, write)
208-
end,
209-
rabbit_misc:execute_mnesia_transaction(F)
210+
?try_mnesia_tx_or_upgrade_amqqueue_and_retry(
211+
rabbit_misc:execute_mnesia_transaction(
212+
fun () ->
213+
mnesia:write(rabbit_durable_queue, Q3, write)
214+
end),
215+
begin
216+
Q4 = amqqueue:upgrade(Q3),
217+
rabbit_misc:execute_mnesia_transaction(
218+
fun () ->
219+
mnesia:write(rabbit_durable_queue, Q4, write)
220+
end)
221+
end)
210222
end || Q0 <- Qs],
211223
ok.
212224

0 commit comments

Comments
 (0)