@@ -33,6 +33,18 @@ def _clean_xml_file(f_xml):
3333 fd_txt .write (e .text .strip () + '\n ' )
3434
3535
def _clean_inner_xml_file(f_xml, base, stream):
    """Extract ``<seg>`` text from an XML byte stream into a plain-text file.

    The output file is named after ``f_xml`` with its directory part and its
    last extension removed, and is written inside ``base`` (created when it
    does not exist). Every ``<seg>`` found under a ``<doc>`` element
    contributes one whitespace-stripped line.

    Args:
        f_xml: Path or name of the XML source; used only to derive the name
            of the output file.
        base: Directory that receives the output file.
        stream: Binary file-like object whose ``read()`` returns the UTF-8
            encoded XML document.

    Returns:
        The path of the written text file (``base`` joined with the derived
        file name).

    Raises:
        xml.etree.ElementTree.ParseError: If the stream is not well-formed XML.
        IndexError: If the document root has no child element.
    """
    f_txt = os.path.basename(os.path.splitext(f_xml)[0])
    os.makedirs(base, exist_ok=True)
    out_file = os.path.join(base, f_txt)

    # Parse before opening the output so malformed input does not leave an
    # empty file behind. The <doc> elements are looked up under the first
    # child of the document root.
    root = ET.fromstring(stream.read().decode("utf-8"))[0]

    with codecs.open(out_file, mode='w', encoding='utf-8') as fd_txt:
        for doc in root.findall('doc'):
            for e in doc.findall('seg'):
                # An empty <seg/> has ``text`` set to None; write an empty
                # line for it instead of raising AttributeError.
                fd_txt.write((e.text or '').strip() + '\n')
    return out_file
46+
47+
3648def _clean_tags_file (f_orig ):
3749 xml_tags = [
3850 '<url' , '<keywords' , '<talkid' , '<description' , '<reviewer' ,
@@ -50,6 +62,34 @@ def _clean_tags_file(f_orig):
5062 fd_txt .write (line .strip () + '\n ' )
5163
5264
def _clean_inner_tags_file(f_orig, base, stream):
    """Copy a ``.tags`` byte stream to a text file, dropping metadata lines.

    Lines containing any of the known XML metadata tags (``<url``,
    ``<keywords``, ``<doc`` ...) are skipped; every other line is stripped of
    surrounding whitespace and written followed by a newline. The output file
    is named after ``f_orig`` with every ``.tags`` occurrence and the
    directory part removed, and is written inside ``base`` (created when it
    does not exist).

    Args:
        f_orig: Path or name of the tagged source; used only to derive the
            name of the output file.
        base: Directory that receives the output file.
        stream: Binary file-like object whose ``readlines()`` yields UTF-8
            encoded lines.

    Returns:
        The path of the written text file (``base`` joined with the derived
        file name).
    """
    xml_tags = [
        '<url', '<keywords', '<talkid', '<description', '<reviewer',
        '<translator', '<title', '<speaker', '<doc', '</doc'
    ]
    f_txt = os.path.basename(f_orig.replace('.tags', ''))
    os.makedirs(base, exist_ok=True)
    # Write under ``base`` rather than the current working directory, in line
    # with the other helpers in this module that take a ``base`` argument.
    out_file = os.path.join(base, f_txt)
    with codecs.open(out_file, mode='w', encoding='utf-8') as fd_txt:
        for raw_line in stream.readlines():
            # Decode once and reuse the text for both the filter and the write.
            line = raw_line.decode("utf-8")
            if not any(tag in line for tag in xml_tags):
                # TODO: Fix utf-8 next line mark
                fd_txt.write(line.strip() + '\n')
    return out_file
81+
82+
def _rewrite_text_file(file, base, stream):
    """Write the UTF-8 decoded lines of a byte stream to a file in ``base``.

    Args:
        file: Path or name of the source; only its base name is used, as the
            name of the output file.
        base: Directory that receives the output file; created when missing.
        stream: Binary file-like object whose ``readlines()`` yields UTF-8
            encoded lines.

    Returns:
        The path of the written file (``base`` joined with the base name of
        ``file``).
    """
    target_name = os.path.basename(file)
    os.makedirs(base, exist_ok=True)
    destination = os.path.join(base, target_name)
    with open(destination, 'w', encoding='utf-8') as sink:
        # Lines are decoded lazily and written in stream order.
        sink.writelines(chunk.decode("utf-8") for chunk in stream.readlines())
    return destination
91+
92+
5393def _create_data_from_json (data_path ):
5494 with open (data_path ) as json_file :
5595 raw_json_data = json .load (json_file )['data' ]
0 commit comments