Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit ec18b0f

Browse files
committed
add tests and documentation.
1 parent aaa5a6f commit ec18b0f

File tree

2 files changed

+115
-1
lines changed

2 files changed

+115
-1
lines changed

test/data/test_dataset_utils.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
from ..common.torchtext_test_case import TorchtextTestCase
2+
3+
from torchtext.data.datasets_utils import IOBDataPipe
4+
from torch.utils.data.datapipes.iter import IterableWrapper
5+
6+
7+
class TestDatasetUtils(TorchtextTestCase):
    """Tests for ``IOBDataPipe`` and its functional form ``read_iob``.

    Both tests feed the same space-separated IOB lines through the datapipe
    and check how many examples come out and how long each column is. They
    differ only in how the datapipe is constructed (class call vs. the
    functional ``read_iob`` method).
    """

    # One IOB example: "<surface form> <label>" per line, separated by a space.
    _SENTENCE = (
        "Alex I-PER",
        "is O",
        "going O",
        "to O",
        "Los I-LOC",
        "Angeles I-LOC",
        "in O",
        "California I-LOC",
    )

    @staticmethod
    def _wrap(lines):
        """Wrap raw lines as the ``(filename, line)`` pairs the datapipe consumes."""
        # The filename half of each pair is a placeholder for these tests.
        return IterableWrapper([("ignored.txt", line) for line in lines])

    def _check_iob_reader(self, make_datapipe):
        """Run the shared IOB assertions against one way of building the datapipe.

        Args:
            make_datapipe: callable that takes the wrapped source datapipe and
                returns the IOB datapipe under test.
        """
        # Case 1: a single example with no blank separator line.
        single = list(self._SENTENCE)
        examples = list(make_datapipe(self._wrap(single)))
        # There is exactly one example in this dataset.
        self.assertEqual(len(examples), 1)
        # The list of surface forms has one entry per input line.
        self.assertEqual(len(examples[0][0]), len(single))
        # The list of labels has one entry per input line.
        self.assertEqual(len(examples[0][1]), len(single))

        # Case 2: two examples separated by one empty line.
        double = list(self._SENTENCE) + [""] + list(self._SENTENCE)
        boundary = double.index("")
        examples = list(make_datapipe(self._wrap(double)))
        # There are two examples in this dataset.
        self.assertEqual(len(examples), 2)
        # First example: surface forms and labels cover every line before the
        # empty line.
        self.assertEqual(len(examples[0][0]), boundary)
        self.assertEqual(len(examples[0][1]), boundary)
        # Second example: surface forms and labels cover every line after the
        # empty line (the separator itself is not counted).
        remaining = len(double) - boundary - 1
        self.assertEqual(len(examples[1][0]), remaining)
        self.assertEqual(len(examples[1][1]), remaining)

    def test_iob_datapipe_basic(self):
        """Build the datapipe by calling ``IOBDataPipe`` directly."""
        self._check_iob_reader(lambda source: IOBDataPipe(source, sep=" "))

    def test_iob_datapipe_functional(self):
        """Build the datapipe through the functional ``read_iob`` method."""
        self._check_iob_reader(lambda source: source.read_iob(sep=" "))
111+
112+

torchtext/data/datasets_utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,9 @@ def __str__(self):
323323

324324
@functional_datapipe("read_iob")
325325
class IOBDataPipe(IterDataPipe):
326+
"""A datapipe responsible for reading sep-delimited IOB data from a stream.
327+
328+
Used for CONLL 2000 and UDPOS."""
326329
def __init__(self, dp, sep: str = "\t") -> None:
327330
self.dp = dp
328331
self.sep = sep
@@ -336,7 +339,6 @@ def __iter__(self):
336339
yield columns
337340
columns = []
338341
else:
339-
print(line)
340342
for i, column in enumerate(line.split(self.sep)):
341343
if len(columns) < i + 1:
342344
columns.append([])

0 commit comments

Comments
 (0)