@@ -3340,6 +3340,31 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
33403340 return 0 ;
33413341}
33423342
3343+ static bool mark_inode_as_not_logged (const struct btrfs_trans_handle * trans ,
3344+ struct btrfs_inode * inode )
3345+ {
3346+ bool ret = false;
3347+
3348+ /*
3349+ * Do this only if ->logged_trans is still 0 to prevent races with
3350+ * concurrent logging as we may see the inode not logged when
3351+ * inode_logged() is called but it gets logged after inode_logged() did
3352+ * not find it in the log tree and we end up setting ->logged_trans to a
3353+ * value less than trans->transid after the concurrent logging task has
3354+ * set it to trans->transid. As a consequence, subsequent rename, unlink
3355+ * and link operations may end up not logging new names and removing old
3356+ * names from the log.
3357+ */
3358+ spin_lock (& inode -> lock );
3359+ if (inode -> logged_trans == 0 )
3360+ inode -> logged_trans = trans -> transid - 1 ;
3361+ else if (inode -> logged_trans == trans -> transid )
3362+ ret = true;
3363+ spin_unlock (& inode -> lock );
3364+
3365+ return ret ;
3366+ }
3367+
33433368/*
33443369 * Check if an inode was logged in the current transaction. This correctly deals
33453370 * with the case where the inode was logged but has a logged_trans of 0, which
@@ -3357,15 +3382,32 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
33573382 struct btrfs_key key ;
33583383 int ret ;
33593384
3360- if (inode -> logged_trans == trans -> transid )
3385+ /*
3386+ * Quick lockless call, since once ->logged_trans is set to the current
3387+ * transaction, we never set it to a lower value anywhere else.
3388+ */
3389+ if (data_race (inode -> logged_trans ) == trans -> transid )
33613390 return 1 ;
33623391
33633392 /*
3364- * If logged_trans is not 0, then we know the inode logged was not logged
3365- * in this transaction, so we can return false right away.
3393+ * If logged_trans is not 0 and not trans->transid, then we know the
3394+ * inode was not logged in this transaction, so we can return false
3395+ * right away. We take the lock to avoid a race caused by load/store
3396+ * tearing with a concurrent btrfs_log_inode() call or a concurrent task
3397+ * in this function further below - an update to trans->transid can be
3398+ * teared into two 32 bits updates for example, in which case we could
3399+ * see a positive value that is not trans->transid and assume the inode
3400+ * was not logged when it was.
33663401 */
3367- if (inode -> logged_trans > 0 )
3402+ spin_lock (& inode -> lock );
3403+ if (inode -> logged_trans == trans -> transid ) {
3404+ spin_unlock (& inode -> lock );
3405+ return 1 ;
3406+ } else if (inode -> logged_trans > 0 ) {
3407+ spin_unlock (& inode -> lock );
33683408 return 0 ;
3409+ }
3410+ spin_unlock (& inode -> lock );
33693411
33703412 /*
33713413 * If no log tree was created for this root in this transaction, then
@@ -3374,10 +3416,8 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
33743416 * transaction's ID, to avoid the search below in a future call in case
33753417 * a log tree gets created after this.
33763418 */
3377- if (!test_bit (BTRFS_ROOT_HAS_LOG_TREE , & inode -> root -> state )) {
3378- inode -> logged_trans = trans -> transid - 1 ;
3379- return 0 ;
3380- }
3419+ if (!test_bit (BTRFS_ROOT_HAS_LOG_TREE , & inode -> root -> state ))
3420+ return mark_inode_as_not_logged (trans , inode );
33813421
33823422 /*
33833423 * We have a log tree and the inode's logged_trans is 0. We can't tell
@@ -3431,29 +3471,17 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
34313471 * Set logged_trans to a value greater than 0 and less then the
34323472 * current transaction to avoid doing the search in future calls.
34333473 */
3434- inode -> logged_trans = trans -> transid - 1 ;
3435- return 0 ;
3474+ return mark_inode_as_not_logged (trans , inode );
34363475 }
34373476
34383477 /*
34393478 * The inode was previously logged and then evicted, set logged_trans to
34403479 * the current transacion's ID, to avoid future tree searches as long as
34413480 * the inode is not evicted again.
34423481 */
3482+ spin_lock (& inode -> lock );
34433483 inode -> logged_trans = trans -> transid ;
3444-
3445- /*
3446- * If it's a directory, then we must set last_dir_index_offset to the
3447- * maximum possible value, so that the next attempt to log the inode does
3448- * not skip checking if dir index keys found in modified subvolume tree
3449- * leaves have been logged before, otherwise it would result in attempts
3450- * to insert duplicate dir index keys in the log tree. This must be done
3451- * because last_dir_index_offset is an in-memory only field, not persisted
3452- * in the inode item or any other on-disk structure, so its value is lost
3453- * once the inode is evicted.
3454- */
3455- if (S_ISDIR (inode -> vfs_inode .i_mode ))
3456- inode -> last_dir_index_offset = (u64 )- 1 ;
3484+ spin_unlock (& inode -> lock );
34573485
34583486 return 1 ;
34593487}
@@ -4045,7 +4073,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
40454073
40464074/*
40474075 * If the inode was logged before and it was evicted, then its
4048- * last_dir_index_offset is (u64)-1 , so we don't the value of the last index
4076+ * last_dir_index_offset is 0 , so we don't know the value of the last index
40494077 * key offset. If that's the case, search for it and update the inode. This
40504078 * is to avoid lookups in the log tree every time we try to insert a dir index
40514079 * key from a leaf changed in the current transaction, and to allow us to always
@@ -4061,7 +4089,7 @@ static int update_last_dir_index_offset(struct btrfs_inode *inode,
40614089
40624090 lockdep_assert_held (& inode -> log_mutex );
40634091
4064- if (inode -> last_dir_index_offset != ( u64 ) - 1 )
4092+ if (inode -> last_dir_index_offset != 0 )
40654093 return 0 ;
40664094
40674095 if (!ctx -> logged_before ) {
0 commit comments