Skip to content

Commit e08ac99

Browse files
ablagodarenko authored and tytso committed
ext4: add largedir feature
This INCOMPAT_LARGEDIR feature allows larger directories to be created in ldiskfs, both with directory sizes over 2GB and with a maximum htree depth of 3 instead of the current limit of 2. These features are needed in order to exceed the current limit of approximately 10M entries in a single directory. This patch was originally written by Yang Sheng to support the Lustre server. [ Bumped the credits needed to update an indexed directory -- tytso ] Signed-off-by: Liang Zhen <[email protected]> Signed-off-by: Yang Sheng <[email protected]> Signed-off-by: Artem Blagodarenko <[email protected]> Signed-off-by: Theodore Ts'o <[email protected]> Reviewed-by: Andreas Dilger <[email protected]>
1 parent 67a7d5f commit e08ac99

File tree

4 files changed

+113
-47
lines changed

4 files changed

+113
-47
lines changed

fs/ext4/ext4.h

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1800,7 +1800,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT)
18001800
EXT4_FEATURE_INCOMPAT_MMP | \
18011801
EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
18021802
EXT4_FEATURE_INCOMPAT_ENCRYPT | \
1803-
EXT4_FEATURE_INCOMPAT_CSUM_SEED)
1803+
EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
1804+
EXT4_FEATURE_INCOMPAT_LARGEDIR)
18041805
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
18051806
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
18061807
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -2126,6 +2127,16 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
21262127
*/
21272128
#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
21282129

2130+
/* htree levels for ext4 */
2131+
#define EXT4_HTREE_LEVEL_COMPAT 2
2132+
#define EXT4_HTREE_LEVEL 3
2133+
2134+
static inline int ext4_dir_htree_level(struct super_block *sb)
2135+
{
2136+
return ext4_has_feature_largedir(sb) ?
2137+
EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
2138+
}
2139+
21292140
/*
21302141
* Timeout and state flag for lazy initialization inode thread.
21312142
*/
@@ -2756,13 +2767,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
27562767
es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
27572768
}
27582769

2759-
static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
2770+
static inline loff_t ext4_isize(struct super_block *sb,
2771+
struct ext4_inode *raw_inode)
27602772
{
2761-
if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
2773+
if (ext4_has_feature_largedir(sb) ||
2774+
S_ISREG(le16_to_cpu(raw_inode->i_mode)))
27622775
return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
27632776
le32_to_cpu(raw_inode->i_size_lo);
2764-
else
2765-
return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
2777+
2778+
return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
27662779
}
27672780

27682781
static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)

fs/ext4/ext4_jbd2.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,14 @@
7777

7878
#define EXT4_RESERVE_TRANS_BLOCKS 12U
7979

80-
#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8
80+
/*
81+
* Number of credits needed if we need to insert an entry into a
82+
* directory. For each new index block, we need 4 blocks (old index
83+
* block, new index block, bitmap block, bg summary). For normal
84+
* htree directories there are 2 levels; if the largedir feature
85+
* is enabled, there are 3 levels.
86+
*/
87+
#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U
8188

8289
#ifdef CONFIG_QUOTA
8390
/* Amount of blocks needed for quota update - we know that the structure was

fs/ext4/inode.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4712,7 +4712,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
47124712
if (ext4_has_feature_64bit(sb))
47134713
ei->i_file_acl |=
47144714
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4715-
inode->i_size = ext4_isize(raw_inode);
4715+
inode->i_size = ext4_isize(sb, raw_inode);
47164716
if ((size = i_size_read(inode)) < 0) {
47174717
EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
47184718
ret = -EFSCORRUPTED;
@@ -5037,7 +5037,7 @@ static int ext4_do_update_inode(handle_t *handle,
50375037
raw_inode->i_file_acl_high =
50385038
cpu_to_le16(ei->i_file_acl >> 32);
50395039
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
5040-
if (ei->i_disksize != ext4_isize(raw_inode)) {
5040+
if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
50415041
ext4_isize_set(raw_inode, ei->i_disksize);
50425042
need_datasync = 1;
50435043
}

fs/ext4/namei.c

Lines changed: 85 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -513,7 +513,7 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
513513

514514
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
515515
{
516-
return le32_to_cpu(entry->block) & 0x00ffffff;
516+
return le32_to_cpu(entry->block) & 0x0fffffff;
517517
}
518518

519519
static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
@@ -739,6 +739,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
739739
struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
740740
u32 hash;
741741

742+
memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
742743
frame->bh = ext4_read_dirblock(dir, 0, INDEX);
743744
if (IS_ERR(frame->bh))
744745
return (struct dx_frame *) frame->bh;
@@ -768,9 +769,15 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
768769
}
769770

770771
indirect = root->info.indirect_levels;
771-
if (indirect > 1) {
772-
ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
773-
root->info.indirect_levels);
772+
if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
773+
ext4_warning(dir->i_sb,
774+
"Directory (ino: %lu) htree depth %#06x exceed"
775+
"supported value", dir->i_ino,
776+
ext4_dir_htree_level(dir->i_sb));
777+
if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
778+
ext4_warning(dir->i_sb, "Enable large directory "
779+
"feature to access it");
780+
}
774781
goto fail;
775782
}
776783

@@ -859,12 +866,19 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
859866

860867
static void dx_release(struct dx_frame *frames)
861868
{
869+
struct dx_root_info *info;
870+
int i;
871+
862872
if (frames[0].bh == NULL)
863873
return;
864874

865-
if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
866-
brelse(frames[1].bh);
867-
brelse(frames[0].bh);
875+
info = &((struct dx_root *)frames[0].bh->b_data)->info;
876+
for (i = 0; i <= info->indirect_levels; i++) {
877+
if (frames[i].bh == NULL)
878+
break;
879+
brelse(frames[i].bh);
880+
frames[i].bh = NULL;
881+
}
868882
}
869883

870884
/*
@@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
10501064
{
10511065
struct dx_hash_info hinfo;
10521066
struct ext4_dir_entry_2 *de;
1053-
struct dx_frame frames[2], *frame;
1067+
struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
10541068
struct inode *dir;
10551069
ext4_lblk_t block;
10561070
int count = 0;
@@ -1485,7 +1499,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
14851499
struct ext4_dir_entry_2 **res_dir)
14861500
{
14871501
struct super_block * sb = dir->i_sb;
1488-
struct dx_frame frames[2], *frame;
1502+
struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
14891503
struct buffer_head *bh;
14901504
ext4_lblk_t block;
14911505
int retval;
@@ -1889,7 +1903,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
18891903
*/
18901904
dir->i_mtime = dir->i_ctime = current_time(dir);
18911905
ext4_update_dx_flag(dir);
1892-
dir->i_version++;
1906+
inode_inc_iversion(dir);
18931907
ext4_mark_inode_dirty(handle, dir);
18941908
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
18951909
err = ext4_handle_dirty_dirent_node(handle, dir, bh);
@@ -1908,7 +1922,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
19081922
{
19091923
struct buffer_head *bh2;
19101924
struct dx_root *root;
1911-
struct dx_frame frames[2], *frame;
1925+
struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
19121926
struct dx_entry *entries;
19131927
struct ext4_dir_entry_2 *de, *de2;
19141928
struct ext4_dir_entry_tail *t;
@@ -2127,13 +2141,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
21272141
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
21282142
struct inode *dir, struct inode *inode)
21292143
{
2130-
struct dx_frame frames[2], *frame;
2144+
struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
21312145
struct dx_entry *entries, *at;
21322146
struct buffer_head *bh;
21332147
struct super_block *sb = dir->i_sb;
21342148
struct ext4_dir_entry_2 *de;
2149+
int restart;
21352150
int err;
21362151

2152+
again:
2153+
restart = 0;
21372154
frame = dx_probe(fname, dir, NULL, frames);
21382155
if (IS_ERR(frame))
21392156
return PTR_ERR(frame);
@@ -2155,24 +2172,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
21552172
if (err != -ENOSPC)
21562173
goto cleanup;
21572174

2175+
err = 0;
21582176
/* Block full, should compress but for now just split */
21592177
dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
21602178
dx_get_count(entries), dx_get_limit(entries)));
21612179
/* Need to split index? */
21622180
if (dx_get_count(entries) == dx_get_limit(entries)) {
21632181
ext4_lblk_t newblock;
2164-
unsigned icount = dx_get_count(entries);
2165-
int levels = frame - frames;
2182+
int levels = frame - frames + 1;
2183+
unsigned int icount;
2184+
int add_level = 1;
21662185
struct dx_entry *entries2;
21672186
struct dx_node *node2;
21682187
struct buffer_head *bh2;
21692188

2170-
if (levels && (dx_get_count(frames->entries) ==
2171-
dx_get_limit(frames->entries))) {
2172-
ext4_warning_inode(dir, "Directory index full!");
2189+
while (frame > frames) {
2190+
if (dx_get_count((frame - 1)->entries) <
2191+
dx_get_limit((frame - 1)->entries)) {
2192+
add_level = 0;
2193+
break;
2194+
}
2195+
frame--; /* split higher index block */
2196+
at = frame->at;
2197+
entries = frame->entries;
2198+
restart = 1;
2199+
}
2200+
if (add_level && levels == ext4_dir_htree_level(sb)) {
2201+
ext4_warning(sb, "Directory (ino: %lu) index full, "
2202+
"reach max htree level :%d",
2203+
dir->i_ino, levels);
2204+
if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
2205+
ext4_warning(sb, "Large directory feature is "
2206+
"not enabled on this "
2207+
"filesystem");
2208+
}
21732209
err = -ENOSPC;
21742210
goto cleanup;
21752211
}
2212+
icount = dx_get_count(entries);
21762213
bh2 = ext4_append(handle, dir, &newblock);
21772214
if (IS_ERR(bh2)) {
21782215
err = PTR_ERR(bh2);
@@ -2187,15 +2224,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
21872224
err = ext4_journal_get_write_access(handle, frame->bh);
21882225
if (err)
21892226
goto journal_error;
2190-
if (levels) {
2227+
if (!add_level) {
21912228
unsigned icount1 = icount/2, icount2 = icount - icount1;
21922229
unsigned hash2 = dx_get_hash(entries + icount1);
21932230
dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
21942231
icount1, icount2));
21952232

21962233
BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
21972234
err = ext4_journal_get_write_access(handle,
2198-
frames[0].bh);
2235+
(frame - 1)->bh);
21992236
if (err)
22002237
goto journal_error;
22012238

@@ -2211,40 +2248,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
22112248
frame->entries = entries = entries2;
22122249
swap(frame->bh, bh2);
22132250
}
2214-
dx_insert_block(frames + 0, hash2, newblock);
2215-
dxtrace(dx_show_index("node", frames[1].entries));
2251+
dx_insert_block((frame - 1), hash2, newblock);
2252+
dxtrace(dx_show_index("node", frame->entries));
22162253
dxtrace(dx_show_index("node",
22172254
((struct dx_node *) bh2->b_data)->entries));
22182255
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
22192256
if (err)
22202257
goto journal_error;
22212258
brelse (bh2);
2259+
err = ext4_handle_dirty_dx_node(handle, dir,
2260+
(frame - 1)->bh);
2261+
if (err)
2262+
goto journal_error;
2263+
if (restart) {
2264+
err = ext4_handle_dirty_dx_node(handle, dir,
2265+
frame->bh);
2266+
goto journal_error;
2267+
}
22222268
} else {
2223-
dxtrace(printk(KERN_DEBUG
2224-
"Creating second level index...\n"));
2269+
struct dx_root *dxroot;
22252270
memcpy((char *) entries2, (char *) entries,
22262271
icount * sizeof(struct dx_entry));
22272272
dx_set_limit(entries2, dx_node_limit(dir));
22282273

22292274
/* Set up root */
22302275
dx_set_count(entries, 1);
22312276
dx_set_block(entries + 0, newblock);
2232-
((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
2233-
2234-
/* Add new access path frame */
2235-
frame = frames + 1;
2236-
frame->at = at = at - entries + entries2;
2237-
frame->entries = entries = entries2;
2238-
frame->bh = bh2;
2239-
err = ext4_journal_get_write_access(handle,
2240-
frame->bh);
2277+
dxroot = (struct dx_root *)frames[0].bh->b_data;
2278+
dxroot->info.indirect_levels += 1;
2279+
dxtrace(printk(KERN_DEBUG
2280+
"Creating %d level index...\n",
2281+
info->indirect_levels));
2282+
err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
22412283
if (err)
22422284
goto journal_error;
2243-
}
2244-
err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
2245-
if (err) {
2246-
ext4_std_error(inode->i_sb, err);
2247-
goto cleanup;
2285+
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
2286+
brelse(bh2);
2287+
restart = 1;
2288+
goto journal_error;
22482289
}
22492290
}
22502291
de = do_split(handle, dir, &bh, frame, &fname->hinfo);
@@ -2256,10 +2297,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
22562297
goto cleanup;
22572298

22582299
journal_error:
2259-
ext4_std_error(dir->i_sb, err);
2300+
ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
22602301
cleanup:
22612302
brelse(bh);
22622303
dx_release(frames);
2304+
/* @restart is true means htree-path has been changed, we need to
2305+
* repeat dx_probe() to find out valid htree-path
2306+
*/
2307+
if (restart && err == 0)
2308+
goto again;
22632309
return err;
22642310
}
22652311

@@ -2296,7 +2342,7 @@ int ext4_generic_delete_entry(handle_t *handle,
22962342
blocksize);
22972343
else
22982344
de->inode = 0;
2299-
dir->i_version++;
2345+
inode_inc_iversion(dir);
23002346
return 0;
23012347
}
23022348
i += ext4_rec_len_from_disk(de->rec_len, blocksize);

0 commit comments

Comments
 (0)