Skip to content

Commit 8d875f9

Browse files
committed
btrfs: disable strict file flushes for renames and truncates
Truncates and renames are often used to replace old versions of a file with new versions. Applications often expect this to be an atomic replacement, even if they haven't done anything to make sure the new version is fully on disk.

Btrfs has strict flushing in place to make sure that renaming over an old file with a new file will fully flush out the new file before allowing the transaction commit with the rename to complete.

This ordering means the commit code needs to be able to lock file pages, and there are a few paths in the filesystem where we will try to end a transaction with the page lock held. It's rare, but these things can deadlock.

This patch removes the ordered flushes and switches to a best effort filemap_flush like ext4 uses. It's not perfect, but it should fix the deadlocks.

Signed-off-by: Chris Mason <[email protected]>
1 parent 27b9a81 commit 8d875f9

File tree

8 files changed

+6
-267
lines changed

8 files changed

+6
-267
lines changed

fs/btrfs/btrfs_inode.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,12 +84,6 @@ struct btrfs_inode {
8484
*/
8585
struct list_head delalloc_inodes;
8686

87-
/*
88-
* list for tracking inodes that must be sent to disk before a
89-
* rename or truncate commit
90-
*/
91-
struct list_head ordered_operations;
92-
9387
/* node for the red-black tree that links inodes in subvolume root */
9488
struct rb_node rb_node;
9589

fs/btrfs/disk-io.c

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,6 @@ static void end_workqueue_fn(struct btrfs_work *work);
6060
static void free_fs_root(struct btrfs_root *root);
6161
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
6262
int read_only);
63-
static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
64-
struct btrfs_root *root);
6563
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
6664
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
6765
struct btrfs_root *root);
@@ -3829,34 +3827,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root)
38293827
btrfs_cleanup_transaction(root);
38303828
}
38313829

3832-
static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3833-
struct btrfs_root *root)
3834-
{
3835-
struct btrfs_inode *btrfs_inode;
3836-
struct list_head splice;
3837-
3838-
INIT_LIST_HEAD(&splice);
3839-
3840-
mutex_lock(&root->fs_info->ordered_operations_mutex);
3841-
spin_lock(&root->fs_info->ordered_root_lock);
3842-
3843-
list_splice_init(&t->ordered_operations, &splice);
3844-
while (!list_empty(&splice)) {
3845-
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
3846-
ordered_operations);
3847-
3848-
list_del_init(&btrfs_inode->ordered_operations);
3849-
spin_unlock(&root->fs_info->ordered_root_lock);
3850-
3851-
btrfs_invalidate_inodes(btrfs_inode->root);
3852-
3853-
spin_lock(&root->fs_info->ordered_root_lock);
3854-
}
3855-
3856-
spin_unlock(&root->fs_info->ordered_root_lock);
3857-
mutex_unlock(&root->fs_info->ordered_operations_mutex);
3858-
}
3859-
38603830
static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
38613831
{
38623832
struct btrfs_ordered_extent *ordered;
@@ -4093,8 +4063,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
40934063
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
40944064
struct btrfs_root *root)
40954065
{
4096-
btrfs_destroy_ordered_operations(cur_trans, root);
4097-
40984066
btrfs_destroy_delayed_refs(cur_trans, root);
40994067

41004068
cur_trans->state = TRANS_STATE_COMMIT_START;

fs/btrfs/file.c

Lines changed: 1 addition & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1838,33 +1838,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
18381838

18391839
int btrfs_release_file(struct inode *inode, struct file *filp)
18401840
{
1841-
/*
1842-
* ordered_data_close is set by settattr when we are about to truncate
1843-
* a file from a non-zero size to a zero size. This tries to
1844-
* flush down new bytes that may have been written if the
1845-
* application were using truncate to replace a file in place.
1846-
*/
1847-
if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1848-
&BTRFS_I(inode)->runtime_flags)) {
1849-
struct btrfs_trans_handle *trans;
1850-
struct btrfs_root *root = BTRFS_I(inode)->root;
1851-
1852-
/*
1853-
* We need to block on a committing transaction to keep us from
1854-
* throwing a ordered operation on to the list and causing
1855-
* something like sync to deadlock trying to flush out this
1856-
* inode.
1857-
*/
1858-
trans = btrfs_start_transaction(root, 0);
1859-
if (IS_ERR(trans))
1860-
return PTR_ERR(trans);
1861-
btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
1862-
btrfs_end_transaction(trans, root);
1863-
if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1864-
filemap_flush(inode->i_mapping);
1865-
}
18661841
if (filp->private_data)
18671842
btrfs_ioctl_trans_end(filp);
1843+
filemap_flush(inode->i_mapping);
18681844
return 0;
18691845
}
18701846

fs/btrfs/inode.c

Lines changed: 3 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -7950,27 +7950,6 @@ static int btrfs_truncate(struct inode *inode)
79507950
min_size);
79517951
BUG_ON(ret);
79527952

7953-
/*
7954-
* setattr is responsible for setting the ordered_data_close flag,
7955-
* but that is only tested during the last file release. That
7956-
* could happen well after the next commit, leaving a great big
7957-
* window where new writes may get lost if someone chooses to write
7958-
* to this file after truncating to zero
7959-
*
7960-
* The inode doesn't have any dirty data here, and so if we commit
7961-
* this is a noop. If someone immediately starts writing to the inode
7962-
* it is very likely we'll catch some of their writes in this
7963-
* transaction, and the commit will find this file on the ordered
7964-
* data list with good things to send down.
7965-
*
7966-
* This is a best effort solution, there is still a window where
7967-
* using truncate to replace the contents of the file will
7968-
* end up with a zero length file after a crash.
7969-
*/
7970-
if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
7971-
&BTRFS_I(inode)->runtime_flags))
7972-
btrfs_add_ordered_operation(trans, root, inode);
7973-
79747953
/*
79757954
* So if we truncate and then write and fsync we normally would just
79767955
* write the extents that changed, which is a problem if we need to
@@ -8118,7 +8097,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
81188097
mutex_init(&ei->delalloc_mutex);
81198098
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
81208099
INIT_LIST_HEAD(&ei->delalloc_inodes);
8121-
INIT_LIST_HEAD(&ei->ordered_operations);
81228100
RB_CLEAR_NODE(&ei->rb_node);
81238101

81248102
return inode;
@@ -8158,17 +8136,6 @@ void btrfs_destroy_inode(struct inode *inode)
81588136
if (!root)
81598137
goto free;
81608138

8161-
/*
8162-
* Make sure we're properly removed from the ordered operation
8163-
* lists.
8164-
*/
8165-
smp_mb();
8166-
if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
8167-
spin_lock(&root->fs_info->ordered_root_lock);
8168-
list_del_init(&BTRFS_I(inode)->ordered_operations);
8169-
spin_unlock(&root->fs_info->ordered_root_lock);
8170-
}
8171-
81728139
if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
81738140
&BTRFS_I(inode)->runtime_flags)) {
81748141
btrfs_info(root->fs_info, "inode %llu still on the orphan list",
@@ -8350,12 +8317,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
83508317
ret = 0;
83518318

83528319
/*
8353-
* we're using rename to replace one file with another.
8354-
* and the replacement file is large. Start IO on it now so
8355-
* we don't add too much work to the end of the transaction
8320+
* we're using rename to replace one file with another. Start IO on it
8321+
* now so we don't add too much work to the end of the transaction
83568322
*/
8357-
if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
8358-
old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
8323+
if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
83598324
filemap_flush(old_inode->i_mapping);
83608325

83618326
/* close the racy window with snapshot create/destroy ioctl */
@@ -8403,12 +8368,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
84038368
*/
84048369
btrfs_pin_log_trans(root);
84058370
}
8406-
/*
8407-
* make sure the inode gets flushed if it is replacing
8408-
* something.
8409-
*/
8410-
if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
8411-
btrfs_add_ordered_operation(trans, root, old_inode);
84128371

84138372
inode_inc_iversion(old_dir);
84148373
inode_inc_iversion(new_dir);

fs/btrfs/ordered-data.c

Lines changed: 0 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
571571

572572
trace_btrfs_ordered_extent_remove(inode, entry);
573573

574-
/*
575-
* we have no more ordered extents for this inode and
576-
* no dirty pages. We can safely remove it from the
577-
* list of ordered extents
578-
*/
579-
if (RB_EMPTY_ROOT(&tree->tree) &&
580-
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
581-
spin_lock(&root->fs_info->ordered_root_lock);
582-
list_del_init(&BTRFS_I(inode)->ordered_operations);
583-
spin_unlock(&root->fs_info->ordered_root_lock);
584-
}
585-
586574
if (!root->nr_ordered_extents) {
587575
spin_lock(&root->fs_info->ordered_root_lock);
588576
BUG_ON(list_empty(&root->ordered_root));
@@ -686,81 +674,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
686674
mutex_unlock(&fs_info->ordered_operations_mutex);
687675
}
688676

689-
/*
690-
* this is used during transaction commit to write all the inodes
691-
* added to the ordered operation list. These files must be fully on
692-
* disk before the transaction commits.
693-
*
694-
* we have two modes here, one is to just start the IO via filemap_flush
695-
* and the other is to wait for all the io. When we wait, we have an
696-
* extra check to make sure the ordered operation list really is empty
697-
* before we return
698-
*/
699-
int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
700-
struct btrfs_root *root, int wait)
701-
{
702-
struct btrfs_inode *btrfs_inode;
703-
struct inode *inode;
704-
struct btrfs_transaction *cur_trans = trans->transaction;
705-
struct list_head splice;
706-
struct list_head works;
707-
struct btrfs_delalloc_work *work, *next;
708-
int ret = 0;
709-
710-
INIT_LIST_HEAD(&splice);
711-
INIT_LIST_HEAD(&works);
712-
713-
mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
714-
spin_lock(&root->fs_info->ordered_root_lock);
715-
list_splice_init(&cur_trans->ordered_operations, &splice);
716-
while (!list_empty(&splice)) {
717-
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
718-
ordered_operations);
719-
inode = &btrfs_inode->vfs_inode;
720-
721-
list_del_init(&btrfs_inode->ordered_operations);
722-
723-
/*
724-
* the inode may be getting freed (in sys_unlink path).
725-
*/
726-
inode = igrab(inode);
727-
if (!inode)
728-
continue;
729-
730-
if (!wait)
731-
list_add_tail(&BTRFS_I(inode)->ordered_operations,
732-
&cur_trans->ordered_operations);
733-
spin_unlock(&root->fs_info->ordered_root_lock);
734-
735-
work = btrfs_alloc_delalloc_work(inode, wait, 1);
736-
if (!work) {
737-
spin_lock(&root->fs_info->ordered_root_lock);
738-
if (list_empty(&BTRFS_I(inode)->ordered_operations))
739-
list_add_tail(&btrfs_inode->ordered_operations,
740-
&splice);
741-
list_splice_tail(&splice,
742-
&cur_trans->ordered_operations);
743-
spin_unlock(&root->fs_info->ordered_root_lock);
744-
ret = -ENOMEM;
745-
goto out;
746-
}
747-
list_add_tail(&work->list, &works);
748-
btrfs_queue_work(root->fs_info->flush_workers,
749-
&work->work);
750-
751-
cond_resched();
752-
spin_lock(&root->fs_info->ordered_root_lock);
753-
}
754-
spin_unlock(&root->fs_info->ordered_root_lock);
755-
out:
756-
list_for_each_entry_safe(work, next, &works, list) {
757-
list_del_init(&work->list);
758-
btrfs_wait_and_free_delalloc_work(work);
759-
}
760-
mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
761-
return ret;
762-
}
763-
764677
/*
765678
* Used to start IO or wait for a given ordered extent to finish.
766679
*
@@ -1120,42 +1033,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
11201033
return index;
11211034
}
11221035

1123-
1124-
/*
1125-
* add a given inode to the list of inodes that must be fully on
1126-
* disk before a transaction commit finishes.
1127-
*
1128-
* This basically gives us the ext3 style data=ordered mode, and it is mostly
1129-
* used to make sure renamed files are fully on disk.
1130-
*
1131-
* It is a noop if the inode is already fully on disk.
1132-
*
1133-
* If trans is not null, we'll do a friendly check for a transaction that
1134-
* is already flushing things and force the IO down ourselves.
1135-
*/
1136-
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
1137-
struct btrfs_root *root, struct inode *inode)
1138-
{
1139-
struct btrfs_transaction *cur_trans = trans->transaction;
1140-
u64 last_mod;
1141-
1142-
last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
1143-
1144-
/*
1145-
* if this file hasn't been changed since the last transaction
1146-
* commit, we can safely return without doing anything
1147-
*/
1148-
if (last_mod <= root->fs_info->last_trans_committed)
1149-
return;
1150-
1151-
spin_lock(&root->fs_info->ordered_root_lock);
1152-
if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
1153-
list_add_tail(&BTRFS_I(inode)->ordered_operations,
1154-
&cur_trans->ordered_operations);
1155-
}
1156-
spin_unlock(&root->fs_info->ordered_root_lock);
1157-
}
1158-
11591036
int __init ordered_data_init(void)
11601037
{
11611038
btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",

fs/btrfs/ordered-data.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
190190
struct btrfs_ordered_extent *ordered);
191191
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
192192
u32 *sum, int len);
193-
int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
194-
struct btrfs_root *root, int wait);
195-
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
196-
struct btrfs_root *root,
197-
struct inode *inode);
198193
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
199194
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
200195
void btrfs_get_logged_extents(struct inode *inode,

0 commit comments

Comments
 (0)