This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 05ec62f

add inner-tar cleaners.
1 parent: 8b3dbc0

File tree

1 file changed: +40 −0 lines changed

torchtext/data/datasets_utils.py

Lines changed: 40 additions & 0 deletions
@@ -33,6 +33,18 @@ def _clean_xml_file(f_xml):
                 fd_txt.write(e.text.strip() + '\n')
 
 
+def _clean_inner_xml_file(f_xml, base, stream):
+    f_txt = os.path.basename(os.path.splitext(f_xml)[0])
+    os.makedirs(base, exist_ok=True)
+    out_file = os.path.join(base, f_txt)
+    with codecs.open(out_file, mode='w', encoding='utf-8') as fd_txt:
+        root = ET.fromstring(stream.read().decode("utf-8"))[0]
+        for doc in root.findall('doc'):
+            for e in doc.findall('seg'):
+                fd_txt.write(e.text.strip() + '\n')
+    return os.path.join(base, f_txt)
+
+
 def _clean_tags_file(f_orig):
     xml_tags = [
         '<url', '<keywords', '<talkid', '<description', '<reviewer',
@@ -50,6 +62,34 @@ def _clean_tags_file(f_orig):
                 fd_txt.write(line.strip() + '\n')
 
 
+def _clean_inner_tags_file(f_orig, base, stream):
+    xml_tags = [
+        '<url', '<keywords', '<talkid', '<description', '<reviewer',
+        '<translator', '<title', '<speaker', '<doc', '</doc'
+    ]
+    f_txt = os.path.basename(f_orig.replace('.tags', ''))
+    os.makedirs(base, exist_ok=True)
+    with codecs.open(f_txt, mode='w', encoding='utf-8') as fd_txt:
+        for line in stream.readlines():
+            if not any(tag in line.decode("utf-8") for tag in xml_tags):
+                # TODO: Fix utf-8 next line mark
+                # fd_txt.write(l.strip() + '\n')
+                # fd_txt.write(l.strip() + u"\u0085")
+                # fd_txt.write(l.lstrip())
+                fd_txt.write(line.decode("utf-8").strip() + '\n')
+    return f_txt
+
+
+def _rewrite_text_file(file, base, stream):
+    f_txt = os.path.basename(file)
+    os.makedirs(base, exist_ok=True)
+    out_file = os.path.join(base, f_txt)
+    with open(out_file, 'w', encoding='utf-8') as f:
+        for line in stream.readlines():
+            f.write(line.decode("utf-8"))
+    return out_file
+
+
 def _create_data_from_json(data_path):
     with open(data_path) as json_file:
         raw_json_data = json.load(json_file)['data']
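
A minimal usage sketch (not part of this commit) of how these helpers could be driven while iterating over the members of an inner tar archive. The wrapper function, archive path, and member-name checks below are illustrative assumptions; only the three cleaner functions come from the diff above.

import tarfile

from torchtext.data.datasets_utils import (
    _clean_inner_tags_file,
    _clean_inner_xml_file,
    _rewrite_text_file,
)


def clean_inner_tar(inner_tar_path, base):
    # Hypothetical driver: walk the members of an inner tar archive and route
    # each file-like stream to the matching cleaner, without extracting the
    # raw members to disk first.
    cleaned = []
    with tarfile.open(inner_tar_path, 'r') as tar:
        for member in tar.getmembers():
            if not member.isfile():
                continue
            stream = tar.extractfile(member)  # binary file-like object
            if member.name.endswith('.xml'):
                # XML transcripts: keep only the <seg> text inside each <doc>.
                cleaned.append(_clean_inner_xml_file(member.name, base, stream))
            elif '.tags' in member.name:
                # .tags files: drop metadata lines such as <url>, <talkid>, ...
                cleaned.append(_clean_inner_tags_file(member.name, base, stream))
            else:
                # Plain text members are copied through unchanged.
                cleaned.append(_rewrite_text_file(member.name, base, stream))
    return cleaned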

0 commit comments
