Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit b668624

Browse files
committed
add docstring for new functions.
1 parent a2dc38e commit b668624

File tree

1 file changed

+35
-0
lines changed

1 file changed

+35
-0
lines changed

torchtext/data/datasets_utils.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,17 @@ def _clean_xml_file(f_xml):
3434

3535

3636
def _clean_inner_xml_file(f_xml, base, stream):
37+
"""Accepts an XML filename within a tarball and a stream of the byte contents
38+
within that file and writes the cleaned contents to a new, untarred file
39+
found in the provided base directory.
40+
41+
Args:
42+
f_xml: the full path of the XML file in the archive
43+
base: the directory to which the new file should be written
44+
stream: the byte datapipe of the contents of f_xml
45+
46+
Returns: the path to the newly-written file
47+
"""
3748
f_txt = os.path.basename(os.path.splitext(f_xml)[0])
3849
os.makedirs(base, exist_ok=True)
3950
out_file = os.path.join(base, f_txt)
@@ -63,6 +74,17 @@ def _clean_tags_file(f_orig):
6374

6475

6576
def _clean_inner_tags_file(f_orig, base, stream):
77+
"""Accepts a tags filename within a tarball and a stream of the byte contents
78+
within that file and writes the cleaned contents to a new, untarred file
79+
found in the provided base directory.
80+
81+
Args:
82+
f_orig: the full path of the tags file in the archive
83+
base: the directory to which the new file should be written
84+
stream: the byte datapipe of the contents of f_orig
85+
86+
Returns: the path to the newly-written file
87+
"""
6688
xml_tags = [
6789
'<url', '<keywords', '<talkid', '<description', '<reviewer',
6890
'<translator', '<title', '<speaker', '<doc', '</doc'
@@ -81,6 +103,17 @@ def _clean_inner_tags_file(f_orig, base, stream):
81103

82104

83105
def _rewrite_text_file(file, base, stream):
106+
"""Accepts a text filename within a tarball and a stream of the byte contents
107+
within that file and writes the cleaned contents to a new, untarred file
108+
found in the provided base directory.
109+
110+
Args:
111+
file: the full path of the text file in the archive
112+
base: the directory to which the new file should be written
113+
stream: the byte datapipe of the contents of file
114+
115+
Returns: the path to the newly-written file
116+
"""
84117
f_txt = os.path.basename(file)
85118
os.makedirs(base, exist_ok=True)
86119
out_file = os.path.join(base, f_txt)
@@ -89,13 +122,15 @@ def _rewrite_text_file(file, base, stream):
89122
f.write(line.decode("utf-8"))
90123
return out_file
91124

125+
92126
def _clean_files(fname, base, stream):
93127
if 'xml' in fname:
94128
return _clean_inner_xml_file(fname, base, stream)
95129
elif "tags" in fname:
96130
return _clean_inner_tags_file(fname, base, stream)
97131
return _rewrite_text_file(fname, base, stream)
98132

133+
99134
def _create_data_from_json(data_path):
100135
with open(data_path) as json_file:
101136
raw_json_data = json.load(json_file)['data']

0 commit comments

Comments
 (0)