Skip to content

Commit ebe1d1f

Browse files
authored
HADOOP-17362. reduce RPC calls doing ls on HAR file (#2444). Contributed by Daryn Sharp and Ahmed Hussein
1 parent f56cd88 commit ebe1d1f

File tree

3 files changed

+38
-40
lines changed

3 files changed

+38
-40
lines changed

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java

Lines changed: 34 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import java.net.URISyntaxException;
3636
import java.net.URLDecoder;
3737
import java.util.*;
38+
import java.util.concurrent.ConcurrentHashMap;
3839

3940
import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs;
4041

@@ -513,41 +514,22 @@ private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses)
513514
if (!parentString.endsWith(Path.SEPARATOR)){
514515
parentString += Path.SEPARATOR;
515516
}
516-
Path harPath = new Path(parentString);
517-
int harlen = harPath.depth();
518-
final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
519-
520-
for (HarStatus hstatus : metadata.archive.values()) {
521-
String child = hstatus.getName();
522-
if ((child.startsWith(parentString))) {
523-
Path thisPath = new Path(child);
524-
if (thisPath.depth() == harlen + 1) {
525-
statuses.add(toFileStatus(hstatus, cache));
526-
}
527-
}
517+
518+
for (String child: parent.children) {
519+
Path p = new Path(parentString + child);
520+
statuses.add(toFileStatus(metadata.archive.get(p)));
528521
}
529522
}
530523

531524
/**
532525
* Combine the status stored in the index and the underlying status.
533526
* @param h status stored in the index
534-
* @param cache caching the underlying file statuses
535527
* @return the combined file status
536528
* @throws IOException
537529
*/
538-
private FileStatus toFileStatus(HarStatus h,
539-
Map<String, FileStatus> cache) throws IOException {
540-
FileStatus underlying = null;
541-
if (cache != null) {
542-
underlying = cache.get(h.partName);
543-
}
544-
if (underlying == null) {
545-
final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
546-
underlying = fs.getFileStatus(p);
547-
if (cache != null) {
548-
cache.put(h.partName, underlying);
549-
}
550-
}
530+
private FileStatus toFileStatus(HarStatus h) throws IOException {
531+
final Path p = h.isDir ? archivePath : new Path(archivePath, h.partName);
532+
FileStatus underlying = metadata.getPartFileStatus(p);
551533

552534
long modTime = 0;
553535
int version = metadata.getVersion();
@@ -658,7 +640,7 @@ public long getModificationTime() {
658640
@Override
659641
public FileStatus getFileStatus(Path f) throws IOException {
660642
HarStatus hstatus = getFileHarStatus(f);
661-
return toFileStatus(hstatus, null);
643+
return toFileStatus(hstatus);
662644
}
663645

664646
private HarStatus getFileHarStatus(Path f) throws IOException {
@@ -815,7 +797,7 @@ public FileStatus[] listStatus(Path f) throws IOException {
815797
if (hstatus.isDir()) {
816798
fileStatusesInIndex(hstatus, statuses);
817799
} else {
818-
statuses.add(toFileStatus(hstatus, null));
800+
statuses.add(toFileStatus(hstatus));
819801
}
820802

821803
return statuses.toArray(new FileStatus[statuses.size()]);
@@ -1143,24 +1125,32 @@ private class HarMetaData {
11431125

11441126
List<Store> stores = new ArrayList<Store>();
11451127
Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
1146-
private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
1128+
// keys are always the internal har path.
1129+
private Map<Path, FileStatus> partFileStatuses = new ConcurrentHashMap<>();
11471130

11481131
public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
11491132
this.fs = fs;
11501133
this.masterIndexPath = masterIndexPath;
11511134
this.archiveIndexPath = archiveIndexPath;
11521135
}
11531136

1154-
public FileStatus getPartFileStatus(Path partPath) throws IOException {
1137+
public FileStatus getPartFileStatus(Path path) throws IOException {
1138+
Path partPath = getPathInHar(path);
11551139
FileStatus status;
11561140
status = partFileStatuses.get(partPath);
11571141
if (status == null) {
1158-
status = fs.getFileStatus(partPath);
1142+
status = fs.getFileStatus(path);
11591143
partFileStatuses.put(partPath, status);
11601144
}
11611145
return status;
11621146
}
11631147

1148+
private void addPartFileStatuses(Path path) throws IOException {
1149+
for (FileStatus stat : fs.listStatus(path)) {
1150+
partFileStatuses.put(getPathInHar(stat.getPath()), stat);
1151+
}
1152+
}
1153+
11641154
public long getMasterIndexTimestamp() {
11651155
return masterIndexTimestamp;
11661156
}
@@ -1217,16 +1207,22 @@ private void parseMetaData() throws IOException {
12171207
try {
12181208
FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
12191209
archiveIndexTimestamp = archiveStat.getModificationTime();
1220-
LineReader aLin;
1210+
1211+
// pre-populate part cache.
1212+
addPartFileStatuses(archiveIndexPath.getParent());
1213+
LineReader aLin = null;
12211214

12221215
// now start reading the real index file
1216+
long pos = -1;
12231217
for (Store s: stores) {
1224-
read = 0;
1225-
aIn.seek(s.begin);
1226-
aLin = new LineReader(aIn, getConf());
1227-
while (read + s.begin < s.end) {
1228-
int tmp = aLin.readLine(line);
1229-
read += tmp;
1218+
if (pos != s.begin) {
1219+
pos = s.begin;
1220+
aIn.seek(s.begin);
1221+
aLin = new LineReader(aIn, getConf());
1222+
}
1223+
1224+
while (pos < s.end) {
1225+
pos += aLin.readLine(line);
12301226
String lineFeed = line.toString();
12311227
String[] parsed = lineFeed.split(" ");
12321228
parsed[0] = decodeFileName(parsed[0]);

hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@
4141
import java.util.Iterator;
4242
import java.util.List;
4343
import java.util.Map;
44-
import java.util.Set;
4544
import java.util.concurrent.CompletableFuture;
4645

4746
import static org.apache.hadoop.fs.Options.ChecksumOpt;

hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystemBasics.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,10 @@
3333
import java.util.HashSet;
3434
import java.util.Set;
3535

36-
import static org.junit.Assert.*;
36+
import static org.junit.Assert.assertEquals;
37+
import static org.junit.Assert.assertFalse;
38+
import static org.junit.Assert.assertTrue;
39+
3740

3841
/**
3942
* This test class checks basic operations with {@link HarFileSystem} including

0 commit comments

Comments (0)