@@ -34,6 +34,17 @@ def _clean_xml_file(f_xml):
3434
3535
3636def _clean_inner_xml_file (f_xml , base , stream ):
37+ """Accepts an XML filename within a tarball and a stream of the byte contents
38+ within that file and writes the cleaned contents to a new, untarred file
39+ found in the provided base directory.
40+
41+ Args:
42+ f_xml: the full path of the XML file in the archive
43+ base: the directory to which the new file should be written
44+ stream: the byte datapipe of the contents of f_xml
45+
46+ Returns: the path to the newly-written file
47+ """
3748 f_txt = os .path .basename (os .path .splitext (f_xml )[0 ])
3849 os .makedirs (base , exist_ok = True )
3950 out_file = os .path .join (base , f_txt )
@@ -63,6 +74,17 @@ def _clean_tags_file(f_orig):
6374
6475
6576def _clean_inner_tags_file (f_orig , base , stream ):
77+ """Accepts a tags filename within a tarball and a stream of the byte contents
78+ within that file and writes the cleaned contents to a new, untarred file
79+ found in the provided base directory.
80+
81+ Args:
82+ f_orig: the full path of the tags file in the archive
83+ base: the directory to which the new file should be written
84+ stream: the byte datapipe of the contents of f_orig
85+
86+ Returns: the path to the newly-written file
87+ """
6688 xml_tags = [
6789 '<url' , '<keywords' , '<talkid' , '<description' , '<reviewer' ,
6890 '<translator' , '<title' , '<speaker' , '<doc' , '</doc'
@@ -81,6 +103,17 @@ def _clean_inner_tags_file(f_orig, base, stream):
81103
82104
83105def _rewrite_text_file (file , base , stream ):
106+ """Accepts a text filename within a tarball and a stream of the byte contents
107+ within that file and writes the cleaned contents to a new, untarred file
108+ found in the provided base directory.
109+
110+ Args:
110+ file: the full path of the text file in the archive
111+ base: the directory to which the new file should be written
112+ stream: the byte datapipe of the contents of that file
114+
115+ Returns: the path to the newly-written file
116+ """
84117 f_txt = os .path .basename (file )
85118 os .makedirs (base , exist_ok = True )
86119 out_file = os .path .join (base , f_txt )
@@ -89,13 +122,15 @@ def _rewrite_text_file(file, base, stream):
89122 f .write (line .decode ("utf-8" ))
90123 return out_file
91124
125+
def _clean_files(fname, base, stream):
    """Route one archive member to the helper that matches its filename.

    The choice is made by substring tests on the whole of ``fname``:
    ``'xml'`` is checked first, then ``'tags'``; anything else is handled
    as a plain text file.

    Args:
        fname: the name (path) of the file inside the archive
        base: the directory to which the new file should be written
        stream: the byte stream of the contents of fname

    Returns: whatever the selected helper returns (per the helpers'
        docstrings, the path to the newly-written file)
    """
    # Order matters: a name containing both substrings is treated as XML.
    if 'xml' in fname:
        handler = _clean_inner_xml_file
    elif 'tags' in fname:
        handler = _clean_inner_tags_file
    else:
        handler = _rewrite_text_file
    return handler(fname, base, stream)
98132
133+
99134def _create_data_from_json (data_path ):
100135 with open (data_path ) as json_file :
101136 raw_json_data = json .load (json_file )['data' ]
0 commit comments