Skip to content

Commit dc28722

Browse files
fdmananakdave
authored andcommitted
btrfs: keep track of the last logged keys when logging a directory
After the first time we log a directory in the current transaction, for each directory item in a changed leaf of the subvolume tree, we have to check if we previously logged the item, in order to overwrite it in case its data changed or skip it in case its data hasn't changed. Checking if we have logged each item before not only wastes time, but it also adds lock contention on the log tree. So in order to minimize the number of times we do such checks, keep track of the offset of the last key we logged for a directory and, on the next time we log the directory, skip the checks for any new keys that have an offset greater than the offset we have previously saved. This is especially effective for index keys, because the offset for these keys comes from a monotonically increasing counter. This patch is part of a patchset comprised of the following 5 patches: btrfs: remove root argument from btrfs_log_inode() and its callees btrfs: remove redundant log root assignment from log_dir_items() btrfs: factor out the copying loop of dir items from log_dir_items() btrfs: insert items in batches when logging a directory when possible btrfs: keep track of the last logged keys when logging a directory This is patch 5/5. The following test was used on a non-debug kernel to measure the impact it has on a directory fsync: $ cat test-dir-fsync.sh #!/bin/bash DEV=/dev/nvme0n1 MNT=/mnt/nvme0n1 NUM_NEW_FILES=100000 NUM_FILE_DELETES=1000 mkfs.btrfs -f $DEV mount -o ssd $DEV $MNT mkdir $MNT/testdir for ((i = 1; i <= $NUM_NEW_FILES; i++)); do echo -n > $MNT/testdir/file_$i done # fsync the directory, this will log the new dir items and the inodes # they point to, because these are new inodes. start=$(date +%s%N) xfs_io -c "fsync" $MNT/testdir end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "dir fsync took $dur ms after adding $NUM_NEW_FILES files" # sync to force transaction commit and wipeout the log. 
sync del_inc=$(( $NUM_NEW_FILES / $NUM_FILE_DELETES )) for ((i = 1; i <= $NUM_NEW_FILES; i += $del_inc)); do rm -f $MNT/testdir/file_$i done # fsync the directory, this will only log dir items, there are no # dentries pointing to new inodes. start=$(date +%s%N) xfs_io -c "fsync" $MNT/testdir end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "dir fsync took $dur ms after deleting $NUM_FILE_DELETES files" umount $MNT Test results with NUM_NEW_FILES set to 100 000 and 1 000 000: **** before patchset, 100 000 files, 1000 deletes **** dir fsync took 848 ms after adding 100000 files dir fsync took 175 ms after deleting 1000 files **** after patchset, 100 000 files, 1000 deletes **** dir fsync took 758 ms after adding 100000 files (-11.2%) dir fsync took 63 ms after deleting 1000 files (-94.1%) **** before patchset, 1 000 000 files, 1000 deletes **** dir fsync took 9945 ms after adding 1000000 files dir fsync took 473 ms after deleting 1000 files **** after patchset, 1 000 000 files, 1000 deletes **** dir fsync took 8677 ms after adding 1000000 files (-13.6%) dir fsync took 146 ms after deleting 1000 files (-105.6%) Signed-off-by: Filipe Manana <[email protected]> Signed-off-by: David Sterba <[email protected]>
1 parent 086dcbf commit dc28722

File tree

4 files changed

+75
-13
lines changed

4 files changed

+75
-13
lines changed

fs/btrfs/btrfs_inode.h

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -138,17 +138,34 @@ struct btrfs_inode {
138138
/* a local copy of root's last_log_commit */
139139
int last_log_commit;
140140

141-
/* total number of bytes pending delalloc, used by stat to calc the
142-
* real block usage of the file
143-
*/
144-
u64 delalloc_bytes;
145-
146-
/*
147-
* Total number of bytes pending delalloc that fall within a file
148-
* range that is either a hole or beyond EOF (and no prealloc extent
149-
* exists in the range). This is always <= delalloc_bytes.
150-
*/
151-
u64 new_delalloc_bytes;
141+
union {
142+
/*
143+
* Total number of bytes pending delalloc, used by stat to
144+
* calculate the real block usage of the file. This is used
145+
* only for files.
146+
*/
147+
u64 delalloc_bytes;
148+
/*
149+
* The offset of the last dir item key that was logged.
150+
* This is used only for directories.
151+
*/
152+
u64 last_dir_item_offset;
153+
};
154+
155+
union {
156+
/*
157+
* Total number of bytes pending delalloc that fall within a file
158+
* range that is either a hole or beyond EOF (and no prealloc extent
159+
* exists in the range). This is always <= delalloc_bytes and this
160+
* is used only for files.
161+
*/
162+
u64 new_delalloc_bytes;
163+
/*
164+
* The offset of the last dir index key that was logged.
165+
* This is used only for directories.
166+
*/
167+
u64 last_dir_index_offset;
168+
};
152169

153170
/*
154171
* total number of bytes pending defrag, used by stat to check whether

fs/btrfs/inode.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9161,8 +9161,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
91619161
WARN_ON(inode->block_rsv.reserved);
91629162
WARN_ON(inode->block_rsv.size);
91639163
WARN_ON(inode->outstanding_extents);
9164-
WARN_ON(inode->delalloc_bytes);
9165-
WARN_ON(inode->new_delalloc_bytes);
9164+
if (!S_ISDIR(vfs_inode->i_mode)) {
9165+
WARN_ON(inode->delalloc_bytes);
9166+
WARN_ON(inode->new_delalloc_bytes);
9167+
}
91669168
WARN_ON(inode->csum_bytes);
91679169
WARN_ON(inode->defrag_bytes);
91689170

fs/btrfs/tree-log.c

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3734,11 +3734,17 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
37343734
const int nritems = btrfs_header_nritems(src);
37353735
const u64 ino = btrfs_ino(inode);
37363736
const bool inode_logged_before = inode_logged(trans, inode);
3737+
u64 last_logged_key_offset;
37373738
bool last_found = false;
37383739
int batch_start = 0;
37393740
int batch_size = 0;
37403741
int i;
37413742

3743+
if (key_type == BTRFS_DIR_ITEM_KEY)
3744+
last_logged_key_offset = inode->last_dir_item_offset;
3745+
else
3746+
last_logged_key_offset = inode->last_dir_index_offset;
3747+
37423748
for (i = path->slots[0]; i < nritems; i++) {
37433749
struct btrfs_key key;
37443750
int ret;
@@ -3750,6 +3756,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
37503756
break;
37513757
}
37523758

3759+
ctx->last_dir_item_offset = key.offset;
37533760
/*
37543761
* We must make sure that when we log a directory entry, the
37553762
* corresponding inode, after log replay, has a matching link
@@ -3786,6 +3793,15 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
37863793

37873794
if (!inode_logged_before)
37883795
goto add_to_batch;
3796+
3797+
/*
3798+
* If we were logged before and have logged dir items, we can skip
3799+
* checking if any item with a key offset larger than the last one
3800+
* we logged is in the log tree, saving time and avoiding adding
3801+
* contention on the log tree.
3802+
*/
3803+
if (key.offset > last_logged_key_offset)
3804+
goto add_to_batch;
37893805
/*
37903806
* Check if the key was already logged before. If not we can add
37913807
* it to a batch for bulk insertion.
@@ -4012,9 +4028,31 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
40124028
int ret;
40134029
int key_type = BTRFS_DIR_ITEM_KEY;
40144030

4031+
/*
4032+
* If this is the first time we are being logged in the current
4033+
* transaction, or we were logged before but the inode was evicted and
4034+
* reloaded later, in which case its logged_trans is 0, reset the values
4035+
* of the last logged key offsets. Note that we don't use the helper
4036+
* function inode_logged() here - that is because the function returns
4037+
* true after an inode eviction, assuming the worst case as it can not
4038+
* know for sure if the inode was logged before. So we can not skip key
4039+
* searches in the case the inode was evicted, because it may not have
4040+
* been logged in this transaction and may have been logged in a past
4041+
* transaction, so we need to reset the last dir item and index offsets
4042+
* to (u64)-1.
4043+
*/
4044+
if (inode->logged_trans != trans->transid) {
4045+
inode->last_dir_item_offset = (u64)-1;
4046+
inode->last_dir_index_offset = (u64)-1;
4047+
}
40154048
again:
40164049
min_key = 0;
40174050
max_key = 0;
4051+
if (key_type == BTRFS_DIR_ITEM_KEY)
4052+
ctx->last_dir_item_offset = inode->last_dir_item_offset;
4053+
else
4054+
ctx->last_dir_item_offset = inode->last_dir_index_offset;
4055+
40184056
while (1) {
40194057
ret = log_dir_items(trans, inode, path, dst_path, key_type,
40204058
ctx, min_key, &max_key);
@@ -4026,8 +4064,11 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
40264064
}
40274065

40284066
if (key_type == BTRFS_DIR_ITEM_KEY) {
4067+
inode->last_dir_item_offset = ctx->last_dir_item_offset;
40294068
key_type = BTRFS_DIR_INDEX_KEY;
40304069
goto again;
4070+
} else {
4071+
inode->last_dir_index_offset = ctx->last_dir_item_offset;
40314072
}
40324073
return 0;
40334074
}

fs/btrfs/tree-log.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ struct btrfs_log_ctx {
1717
int log_transid;
1818
bool log_new_dentries;
1919
bool logging_new_name;
20+
/* Tracks the last logged dir item/index key offset. */
21+
u64 last_dir_item_offset;
2022
struct inode *inode;
2123
struct list_head list;
2224
/* Only used for fast fsyncs. */

0 commit comments

Comments
 (0)