33from torchtext .data .datasets_utils import _ParseIOBData
44from torch .utils .data .datapipes .iter import IterableWrapper
55
6+ from parameterized import parameterized
7+
68
79class TestDatasetUtils (TorchtextTestCase ):
8- def test_iob_datapipe_basic (self ):
10+ @parameterized .expand ([
11+ [lambda it : list (_ParseIOBData (IterableWrapper (it ), sep = " " ))],
12+ [lambda it : list (IterableWrapper (it ).read_iob (sep = " " ))]
13+ ])
14+ def test_iob_datapipe (self , pipe_fn ):
915 iob = [
1016 "Alex I-PER" ,
1117 "is O" ,
@@ -17,8 +23,7 @@ def test_iob_datapipe_basic(self):
1723 "California I-LOC"
1824 ]
1925 iterable = [("ignored.txt" , e ) for e in iob ]
20- iterable = IterableWrapper (iterable )
21- iob_dp = list (_ParseIOBData (iterable , sep = " " ))
26+ iob_dp = pipe_fn (iterable )
2227 # There's only one example in this dataset
2328 self .assertEqual (len (iob_dp ), 1 )
2429 # The length of the list of surface forms is the number of lines in the example
@@ -45,8 +50,7 @@ def test_iob_datapipe_basic(self):
4550 "California I-LOC" ,
4651 ]
4752 iterable = [("ignored.txt" , e ) for e in iob ]
48- iterable = IterableWrapper (iterable )
49- iob_dp = list (_ParseIOBData (iterable , sep = " " ))
53+ iob_dp = pipe_fn (iterable )
5054 # There are two examples in this dataset
5155 self .assertEqual (len (iob_dp ), 2 )
5256 # The length of the first list of surface forms is the length of everything before the empty line.
@@ -57,54 +61,3 @@ def test_iob_datapipe_basic(self):
5761 # The length of the second labels is the length of everything after the empty line.
5862 self .assertEqual (len (iob_dp [1 ][0 ]), len (iob ) - iob .index ("" ) - 1 )
5963 self .assertEqual (len (iob_dp [1 ][1 ]), len (iob ) - iob .index ("" ) - 1 )
60-
61- def test_iob_datapipe_functional (self ):
62- iob = [
63- "Alex I-PER" ,
64- "is O" ,
65- "going O" ,
66- "to O" ,
67- "Los I-LOC" ,
68- "Angeles I-LOC" ,
69- "in O" ,
70- "California I-LOC"
71- ]
72- iterable = [("ignored.txt" , e ) for e in iob ]
73- iob_dp = list (IterableWrapper (iterable ).read_iob (sep = " " ))
74- # There's only one example in this dataset
75- self .assertEqual (len (iob_dp ), 1 )
76- # The length of the list of surface forms is the number of lines in the example
77- self .assertEqual (len (iob_dp [0 ][0 ]), len (iob ))
78- # The length of the list labels is the number of lines in the example
79- self .assertEqual (len (iob_dp [0 ][1 ]), len (iob ))
80- iob = [
81- "Alex I-PER" ,
82- "is O" ,
83- "going O" ,
84- "to O" ,
85- "Los I-LOC" ,
86- "Angeles I-LOC" ,
87- "in O" ,
88- "California I-LOC" ,
89- "" ,
90- "Alex I-PER" ,
91- "is O" ,
92- "going O" ,
93- "to O" ,
94- "Los I-LOC" ,
95- "Angeles I-LOC" ,
96- "in O" ,
97- "California I-LOC" ,
98- ]
99- iterable = [("ignored.txt" , e ) for e in iob ]
100- iob_dp = list (IterableWrapper (iterable ).read_iob (sep = " " ))
101- # There are two examples in this dataset
102- self .assertEqual (len (iob_dp ), 2 )
103- # The length of the first list of surface forms is the length of everything before the empty line.
104- # The length of the first labels is the length of everything before the empty line.
105- self .assertEqual (len (iob_dp [0 ][0 ]), iob .index ("" ))
106- self .assertEqual (len (iob_dp [0 ][1 ]), iob .index ("" ))
107- # The length of the second list of surface forms is the length of everything after the empty line.
108- # The length of the second labels is the length of everything after the empty line.
109- self .assertEqual (len (iob_dp [1 ][0 ]), len (iob ) - iob .index ("" ) - 1 )
110- self .assertEqual (len (iob_dp [1 ][1 ]), len (iob ) - iob .index ("" ) - 1 )
0 commit comments