diff --git a/test/common/case_utils.py b/test/common/case_utils.py
index 9a9340fbff..4e8ae970f9 100644
--- a/test/common/case_utils.py
+++ b/test/common/case_utils.py
@@ -1,3 +1,4 @@
+import random
 import os.path
 import tempfile
 import unittest
@@ -53,3 +54,27 @@ def zip_equal(*iterables):
         if sentinel in combo:
             raise ValueError("Iterables have different lengths")
         yield combo
+
+
+def get_random_unicode(length):
+    # taken from https://stackoverflow.com/a/21666621/2883245
+
+    # Update this to include code point ranges to be sampled
+    include_ranges = [
+        (0x0021, 0x0021),
+        (0x0023, 0x0026),
+        (0x0028, 0x007E),
+        (0x00A1, 0x00AC),
+        (0x00AE, 0x00FF),
+        (0x0100, 0x017F),
+        (0x0180, 0x024F),
+        (0x2C60, 0x2C7F),
+        (0x16A0, 0x16F0),
+        (0x0370, 0x0377),
+        (0x037A, 0x037E),
+        (0x0384, 0x038A),
+        (0x038C, 0x038C),
+    ]
+
+    alphabet = [chr(code_point) for current_range in include_ranges for code_point in range(current_range[0], current_range[1] + 1)]
+    return ''.join(random.choice(alphabet) for i in range(length))
diff --git a/test/datasets/test_agnews.py b/test/datasets/test_agnews.py
index f3a884d7d8..7635a018f0 100644
--- a/test/datasets/test_agnews.py
+++ b/test/datasets/test_agnews.py
@@ -1,13 +1,11 @@
 import os
-import random
-import string
 from collections import defaultdict
 from unittest.mock import patch
 
 from parameterized import parameterized
 from torchtext.datasets.ag_news import AG_NEWS
 
-from ..common.case_utils import TempDirMixin, zip_equal
+from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
 from ..common.torchtext_test_case import TorchtextTestCase
 
 
@@ -22,12 +20,10 @@ def _get_mock_dataset(root_dir):
     mocked_data = defaultdict(list)
     for file_name in ("train.csv", "test.csv"):
         txt_file = os.path.join(base_dir, file_name)
-        with open(txt_file, "w") as f:
+        with open(txt_file, "w", encoding="utf-8") as f:
             for i in range(5):
                 label = seed % 4 + 1
-                rand_string = " ".join(
-                    random.choice(string.ascii_letters) for i in range(seed)
-                )
+                rand_string = get_random_unicode(seed)
                 dataset_line = (label, f"{rand_string} {rand_string}")
                 # append line to correct dataset split
                 mocked_data[os.path.splitext(file_name)[0]].append(dataset_line)
diff --git a/test/datasets/test_amazonreviews.py b/test/datasets/test_amazonreviews.py
index 87dd6f9952..350eb0e446 100644
--- a/test/datasets/test_amazonreviews.py
+++ b/test/datasets/test_amazonreviews.py
@@ -1,6 +1,4 @@
 import os
-import random
-import string
 import tarfile
 from collections import defaultdict
 from unittest.mock import patch
@@ -8,8 +6,8 @@
 from torchtext.datasets.amazonreviewfull import AmazonReviewFull
 from torchtext.datasets.amazonreviewpolarity import AmazonReviewPolarity
 
-from ..common.case_utils import TempDirMixin, zip_equal
 from ..common.parameterized_utils import nested_params
+from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
 from ..common.torchtext_test_case import TorchtextTestCase
@@ -26,15 +24,13 @@ def _get_mock_dataset(root_dir, base_dir_name):
     mocked_data = defaultdict(list)
     for file_name in ("train.csv", "test.csv"):
         txt_file = os.path.join(temp_dataset_dir, file_name)
-        with open(txt_file, "w") as f:
+        with open(txt_file, "w", encoding="utf-8") as f:
             for i in range(5):
                 if base_dir_name == AmazonReviewFull.__name__:
                     label = seed % 5 + 1
                 else:
                     label = seed % 2 + 1
-                rand_string = " ".join(
-                    random.choice(string.ascii_letters) for i in range(seed)
-                )
+                rand_string = get_random_unicode(seed)
                 dataset_line = (label, f"{rand_string} {rand_string}")
                 # append line to correct dataset split
                 mocked_data[os.path.splitext(file_name)[0]].append(dataset_line)
diff --git a/test/datasets/test_cc100.py b/test/datasets/test_cc100.py
index 978d447f67..bc21917452 100644
--- a/test/datasets/test_cc100.py
+++ b/test/datasets/test_cc100.py
@@ -1,7 +1,5 @@
-import lzma
 import os
-import random
-import string
+import lzma
 from collections import defaultdict
 from unittest.mock import patch
 
@@ -9,7 +7,7 @@
 from torchtext.datasets import CC100
 from torchtext.datasets.cc100 import VALID_CODES
 
-from ..common.case_utils import TempDirMixin, zip_equal
+from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
 from ..common.torchtext_test_case import TorchtextTestCase
@@ -26,11 +24,9 @@ def _get_mock_dataset(root_dir):
     for language_code in VALID_CODES:
         file_name = f"{language_code}.txt.xz"
         compressed_file = os.path.join(base_dir, file_name)
-        with lzma.open(compressed_file, "wt") as f:
+        with lzma.open(compressed_file, "wt", encoding="utf-8") as f:
             for i in range(5):
-                rand_string = " ".join(
-                    random.choice(string.ascii_letters) for i in range(seed)
-                )
+                rand_string = get_random_unicode(seed)
                 content = f"{rand_string}\n"
                 f.write(content)
                 mocked_data[language_code].append((language_code, rand_string))
diff --git a/test/datasets/test_conll2000chunking.py b/test/datasets/test_conll2000chunking.py
index 29ae228ba2..a1db406077 100644
--- a/test/datasets/test_conll2000chunking.py
+++ b/test/datasets/test_conll2000chunking.py
@@ -1,14 +1,12 @@
-import gzip
 import os
-import random
-import string
+import gzip
 from collections import defaultdict
 from unittest.mock import patch
 
 from parameterized import parameterized
 from torchtext.datasets.conll2000chunking import CoNLL2000Chunking
 
-from ..common.case_utils import TempDirMixin, zip_equal
+from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
 from ..common.torchtext_test_case import TorchtextTestCase
@@ -25,17 +23,11 @@ def _get_mock_dataset(root_dir):
     for file_name in ("train.txt", "test.txt"):
         txt_file = os.path.join(temp_dataset_dir, file_name)
         mocked_lines = mocked_data[os.path.splitext(file_name)[0]]
-        with open(txt_file, "w") as f:
+        with open(txt_file, "w", encoding="utf-8") as f:
             for i in range(5):
-                rand_strings = [
-                    random.choice(string.ascii_letters) for i in range(seed)
-                ]
-                rand_label_1 = [
-                    random.choice(string.ascii_letters) for i in range(seed)
-                ]
-                rand_label_2 = [
-                    random.choice(string.ascii_letters) for i in range(seed)
-                ]
+                rand_strings = [get_random_unicode(seed)]
+                rand_label_1 = [get_random_unicode(seed)]
+                rand_label_2 = [get_random_unicode(seed)]
                 # one token per line (each sample ends with an extra \n)
                 for rand_string, label_1, label_2 in zip(
                     rand_strings, rand_label_1, rand_label_2
diff --git a/test/datasets/test_dbpedia.py b/test/datasets/test_dbpedia.py
index 51fbf19335..3f9d243a0d 100644
--- a/test/datasets/test_dbpedia.py
+++ b/test/datasets/test_dbpedia.py
@@ -1,6 +1,4 @@
 import os
-import random
-import string
 import tarfile
 from collections import defaultdict
 from unittest.mock import patch
@@ -8,7 +6,7 @@
 from parameterized import parameterized
 from torchtext.datasets.dbpedia import DBpedia
 
-from ..common.case_utils import TempDirMixin, zip_equal
+from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
 from ..common.torchtext_test_case import TorchtextTestCase
@@ -25,12 +23,10 @@ def _get_mock_dataset(root_dir):
     for file_name in ("train.csv", "test.csv"):
         csv_file = os.path.join(temp_dataset_dir, file_name)
         mocked_lines = mocked_data[os.path.splitext(file_name)[0]]
-        with open(csv_file, "w") as f:
+        with open(csv_file, "w", encoding="utf-8") as f:
             for i in range(5):
                 label = seed % 14 + 1
-                rand_string = " ".join(
-                    random.choice(string.ascii_letters) for i in range(seed)
-                )
+                rand_string = get_random_unicode(seed)
                 dataset_line = (label, rand_string + " " + rand_string)
                 f.write(f'{label},"{rand_string}","{rand_string}"\n')
 
diff --git a/test/datasets/test_enwik9.py b/test/datasets/test_enwik9.py
index 71854c4070..1eb0cd60b4 100644
--- a/test/datasets/test_enwik9.py
+++ b/test/datasets/test_enwik9.py
@@ -1,12 +1,10 @@
 import os
-import random
-import string
 import zipfile
 from unittest.mock import patch
 
 from torchtext.datasets.enwik9 import EnWik9
 
-from ..common.case_utils import TempDirMixin, zip_equal
+from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
 from ..common.torchtext_test_case import TorchtextTestCase
@@ -22,13 +20,9 @@ def _get_mock_dataset(root_dir):
     file_name = "enwik9"
     txt_file = os.path.join(temp_dataset_dir, file_name)
     mocked_data = []
-    with open(txt_file, "w") as f:
+    with open(txt_file, "w", encoding="utf-8") as f:
         for i in range(5):
-            rand_string = (
-                "<"
-                + " ".join(random.choice(string.ascii_letters) for i in range(seed))
-                + ">"
-            )
+            rand_string = "<" + get_random_unicode(seed) + ">"
             dataset_line = f"'{rand_string}'"
             f.write(f"'{rand_string}'\n")
diff --git a/test/datasets/test_imdb.py b/test/datasets/test_imdb.py
index b9a35dbd6f..cfebfe3ebc 100644
--- a/test/datasets/test_imdb.py
+++ b/test/datasets/test_imdb.py
@@ -1,6 +1,4 @@
 import os
-import random
-import string
 import tarfile
 from collections import defaultdict
 from unittest.mock import patch
@@ -8,7 +6,7 @@
 from parameterized import parameterized
 from torchtext.datasets.imdb import IMDB
 
-from ..common.case_utils import TempDirMixin, zip_equal
+from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
 from ..common.torchtext_test_case import TorchtextTestCase
@@ -34,10 +32,8 @@ def _get_mock_dataset(root_dir):
             label = "neg" if i < 2 else "pos"
             cur_dir = pos_dir if label == "pos" else neg_dir
             txt_file = os.path.join(cur_dir, f"{i}{i}_{i}.txt")
-            with open(txt_file, "w") as f:
-                rand_string = " ".join(
-                    random.choice(string.ascii_letters) for i in range(seed)
-                )
+            with open(txt_file, "w", encoding="utf-8") as f:
+                rand_string = get_random_unicode(seed)
                 dataset_line = (label, rand_string)
                 # append line to correct dataset split
                 mocked_data[split].append(dataset_line)
diff --git a/test/datasets/test_multi30k.py b/test/datasets/test_multi30k.py
index d0e9e96c04..c26fd77410 100644
--- a/test/datasets/test_multi30k.py
+++ b/test/datasets/test_multi30k.py
@@ -1,14 +1,12 @@
 import os
-import random
-import string
 import tarfile
 from collections import defaultdict
 from unittest.mock import patch
 
 from torchtext.datasets import Multi30k
 
-from ..common.case_utils import TempDirMixin, zip_equal
 from ..common.parameterized_utils import nested_params
+from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
 from ..common.torchtext_test_case import TorchtextTestCase
@@ -24,11 +22,9 @@ def _get_mock_dataset(root_dir):
     mocked_data = defaultdict(list)
     for file_name in ("train.de", "train.en", "val.de", "val.en", "test.de", "test.en"):
         txt_file = os.path.join(temp_dataset_dir, file_name)
-        with open(txt_file, "w") as f:
+        with open(txt_file, "w", encoding="utf-8") as f:
             for i in range(5):
-                rand_string = " ".join(
-                    random.choice(string.ascii_letters) for i in range(seed)
-                )
+                rand_string = get_random_unicode(seed)
                 f.write(rand_string + "\n")
                 mocked_data[file_name].append(rand_string)
                 seed += 1
diff --git a/test/datasets/test_penntreebank.py b/test/datasets/test_penntreebank.py
index ddb448558e..e45e2d4ca0 100644
--- a/test/datasets/test_penntreebank.py
+++ b/test/datasets/test_penntreebank.py
@@ -1,13 +1,11 @@
 import os
-import random
-import string
 from collections import defaultdict
 from unittest.mock import patch
 
 from parameterized import parameterized
 from torchtext.datasets.penntreebank import PennTreebank
 
-from ..common.case_utils import TempDirMixin, zip_equal
+from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
 from ..common.torchtext_test_case import TorchtextTestCase
@@ -22,11 +20,9 @@ def _get_mock_dataset(root_dir):
     mocked_data = defaultdict(list)
     for file_name in ("ptb.train.txt", "ptb.valid.txt", "ptb.test.txt"):
         txt_file = os.path.join(base_dir, file_name)
-        with open(txt_file, "w") as f:
+        with open(txt_file, "w", encoding="utf-8") as f:
             for i in range(5):
-                rand_string = " ".join(
-                    random.choice(string.ascii_letters) for i in range(seed)
-                )
+                rand_string = get_random_unicode(seed)
                 dataset_line = f"{rand_string}"
                 # append line to correct dataset split
                 split = file_name.replace("ptb.", "").replace(".txt", "")
diff --git a/test/datasets/test_sogounews.py b/test/datasets/test_sogounews.py
index 95b53f87f1..1a9eb0c342 100644
--- a/test/datasets/test_sogounews.py
+++ b/test/datasets/test_sogounews.py
@@ -1,6 +1,4 @@
 import os
-import random
-import string
 import tarfile
 from collections import defaultdict
 from unittest.mock import patch
@@ -8,7 +6,7 @@
 from parameterized import parameterized
 from torchtext.datasets.sogounews import SogouNews
 
-from ..common.case_utils import TempDirMixin, zip_equal
+from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
 from ..common.torchtext_test_case import TorchtextTestCase
@@ -24,12 +22,10 @@ def _get_mock_dataset(root_dir):
     mocked_data = defaultdict(list)
     for file_name in ("train.csv", "test.csv"):
         txt_file = os.path.join(temp_dataset_dir, file_name)
-        with open(txt_file, "w") as f:
+        with open(txt_file, "w", encoding="utf-8") as f:
            for i in range(5):
                 label = seed % 5 + 1
-                rand_string = " ".join(
-                    random.choice(string.ascii_letters) for i in range(seed)
-                )
+                rand_string = get_random_unicode(seed)
                 dataset_line = (label, f"{rand_string} {rand_string}")
                 # append line to correct dataset split
                 mocked_data[os.path.splitext(file_name)[0]].append(dataset_line)
diff --git a/test/datasets/test_squads.py b/test/datasets/test_squads.py
index eb1abd19be..04bfc00369 100644
--- a/test/datasets/test_squads.py
+++ b/test/datasets/test_squads.py
@@ -1,7 +1,5 @@
 import json
 import os
-import random
-import string
 import uuid
 from collections import defaultdict
 from random import randint
@@ -11,13 +9,13 @@
 from torchtext.datasets.squad1 import SQuAD1
 from torchtext.datasets.squad2 import SQuAD2
 
-from ..common.case_utils import TempDirMixin, zip_equal
+from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
 from ..common.parameterized_utils import nested_params
 from ..common.torchtext_test_case import TorchtextTestCase
 
 
 def _get_mock_json_data():
-    rand_string = " ".join(random.choice(string.ascii_letters) for i in range(10))
+    rand_string = get_random_unicode(10)
     mock_json_data = {
         "data": [
             {
@@ -60,7 +58,7 @@ def _get_mock_dataset(root_dir, base_dir_name):
     mocked_data = defaultdict(list)
     for file_name in file_names:
         txt_file = os.path.join(base_dir, file_name)
with open(txt_file, "w") as f: + with open(txt_file, "w", encoding="utf-8") as f: mock_json_data = _get_mock_json_data() f.write(json.dumps(mock_json_data)) diff --git a/test/datasets/test_sst2.py b/test/datasets/test_sst2.py index 29fdb6fbed..72985002a8 100644 --- a/test/datasets/test_sst2.py +++ b/test/datasets/test_sst2.py @@ -1,6 +1,4 @@ import os -import random -import string import zipfile from collections import defaultdict from unittest.mock import patch @@ -8,7 +6,7 @@ from parameterized import parameterized from torchtext.datasets.sst2 import SST2 -from ..common.case_utils import TempDirMixin, zip_equal +from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode from ..common.torchtext_test_case import TorchtextTestCase @@ -27,13 +25,11 @@ def _get_mock_dataset(root_dir): ((("sentence", "label"), ("sentence", "label"), ("index", "sentence"))), ): txt_file = os.path.join(temp_dataset_dir, file_name) - with open(txt_file, "w") as f: + with open(txt_file, "w", encoding="utf-8") as f: f.write(f"{col1_name}\t{col2_name}\n") for i in range(5): label = seed % 2 - rand_string = " ".join( - random.choice(string.ascii_letters) for i in range(seed) - ) + rand_string = get_random_unicode(seed) if file_name == "test.tsv": dataset_line = (f"{rand_string} .",) f.write(f"{i}\t{rand_string} .\n") diff --git a/test/datasets/test_udpos.py b/test/datasets/test_udpos.py index 455a7cc019..51561aea96 100644 --- a/test/datasets/test_udpos.py +++ b/test/datasets/test_udpos.py @@ -1,6 +1,4 @@ import os -import random -import string import zipfile from collections import defaultdict from unittest.mock import patch @@ -8,7 +6,7 @@ from parameterized import parameterized from torchtext.datasets.udpos import UDPOS -from ..common.case_utils import TempDirMixin, zip_equal +from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode from ..common.torchtext_test_case import TorchtextTestCase @@ -25,18 +23,11 @@ def _get_mock_dataset(root_dir): for file_name in ["train.txt", "dev.txt", "test.txt"]: txt_file = os.path.join(temp_dataset_dir, file_name) mocked_lines = mocked_data[os.path.splitext(file_name)[0]] - with open(txt_file, "w") as f: + with open(txt_file, "w", encoding="utf-8") as f: for i in range(5): - rand_strings = [ - "".join(random.sample(string.ascii_letters, random.randint(1, 10))) - for i in range(seed) - ] - rand_label_1 = [ - random.choice(string.ascii_letters) for i in range(seed) - ] - rand_label_2 = [ - random.choice(string.ascii_letters) for i in range(seed) - ] + rand_strings = [get_random_unicode(seed)] + rand_label_1 = [get_random_unicode(seed)] + rand_label_2 = [get_random_unicode(seed)] # one token per line (each sample ends with an extra \n) for rand_string, label_1, label_2 in zip( rand_strings, rand_label_1, rand_label_2 diff --git a/test/datasets/test_wikitexts.py b/test/datasets/test_wikitexts.py index 36c26db027..fc938b4e68 100644 --- a/test/datasets/test_wikitexts.py +++ b/test/datasets/test_wikitexts.py @@ -1,6 +1,4 @@ import os -import random -import string import zipfile from collections import defaultdict from unittest.mock import patch @@ -8,8 +6,8 @@ from torchtext.datasets.wikitext103 import WikiText103 from torchtext.datasets.wikitext2 import WikiText2 -from ..common.case_utils import TempDirMixin, zip_equal from ..common.parameterized_utils import nested_params +from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode from ..common.torchtext_test_case import TorchtextTestCase @@ -28,11 +26,9 @@ def 
     for file_name in file_names:
         csv_file = os.path.join(temp_dataset_dir, file_name)
         mocked_lines = mocked_data[os.path.splitext(file_name)[0]]
-        with open(csv_file, "w") as f:
+        with open(csv_file, "w", encoding="utf-8") as f:
             for i in range(5):
-                rand_string = " ".join(
-                    random.choice(string.ascii_letters) for i in range(seed)
-                )
+                rand_string = get_random_unicode(seed)
                 dataset_line = rand_string
                 f.write(f"{rand_string}\n")
 
diff --git a/test/datasets/test_yahooanswers.py b/test/datasets/test_yahooanswers.py
index 30f537c28a..f846624969 100644
--- a/test/datasets/test_yahooanswers.py
+++ b/test/datasets/test_yahooanswers.py
@@ -1,6 +1,4 @@
 import os
-import random
-import string
 import tarfile
 from collections import defaultdict
 from unittest.mock import patch
@@ -8,7 +6,7 @@
 from parameterized import parameterized
 from torchtext.datasets.yahooanswers import YahooAnswers
 
-from ..common.case_utils import TempDirMixin, zip_equal
+from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
 from ..common.torchtext_test_case import TorchtextTestCase
@@ -24,12 +22,10 @@ def _get_mock_dataset(root_dir):
     mocked_data = defaultdict(list)
     for file_name in ("train.csv", "test.csv"):
         txt_file = os.path.join(temp_dataset_dir, file_name)
-        with open(txt_file, "w") as f:
+        with open(txt_file, "w", encoding="utf-8") as f:
             for i in range(5):
                 label = seed % 10 + 1
-                rand_string = " ".join(
-                    random.choice(string.ascii_letters) for i in range(seed)
-                )
+                rand_string = get_random_unicode(seed)
                 dataset_line = (label, f"{rand_string} {rand_string} {rand_string}")
                 # append line to correct dataset split
                 mocked_data[os.path.splitext(file_name)[0]].append(dataset_line)
diff --git a/test/datasets/test_yelpreviews.py b/test/datasets/test_yelpreviews.py
index 3f4ccde54f..303de92389 100644
--- a/test/datasets/test_yelpreviews.py
+++ b/test/datasets/test_yelpreviews.py
@@ -1,6 +1,4 @@
 import os
-import random
-import string
 import tarfile
 from collections import defaultdict
 from unittest.mock import patch
@@ -8,8 +6,8 @@
 from torchtext.datasets.yelpreviewfull import YelpReviewFull
 from torchtext.datasets.yelpreviewpolarity import YelpReviewPolarity
 
-from ..common.case_utils import TempDirMixin, zip_equal
 from ..common.parameterized_utils import nested_params
+from ..common.case_utils import TempDirMixin, zip_equal, get_random_unicode
 from ..common.torchtext_test_case import TorchtextTestCase
@@ -27,15 +25,13 @@ def _get_mock_dataset(root_dir, base_dir_name):
     for file_name in ("train.csv", "test.csv"):
         csv_file = os.path.join(temp_dataset_dir, file_name)
         mocked_lines = mocked_data[os.path.splitext(file_name)[0]]
-        with open(csv_file, "w") as f:
+        with open(csv_file, "w", encoding="utf-8") as f:
             for i in range(5):
                 if base_dir_name == YelpReviewPolarity.__name__:
                     label = seed % 2 + 1
                 else:
                     label = seed % 5 + 1
-                rand_string = " ".join(
-                    random.choice(string.ascii_letters) for i in range(seed)
-                )
+                rand_string = get_random_unicode(seed)
                 dataset_line = (label, f"{rand_string}")
                 f.write(f'"{label}","{rand_string}"\n')
 
diff --git a/torchtext/datasets/ag_news.py b/torchtext/datasets/ag_news.py
index 09d606c04f..1274198629 100644
--- a/torchtext/datasets/ag_news.py
+++ b/torchtext/datasets/ag_news.py
@@ -48,5 +48,6 @@ def AG_NEWS(root: str, split: Union[Tuple[str], str]):
     cache_dp = HttpReader(cache_dp)
     cache_dp = cache_dp.end_caching(mode="wb", same_filepath_fn=True)
 
-    data_dp = FileOpener(cache_dp, mode="r")
+    # TODO: read in text mode with utf-8 encoding, see: https://github.com/pytorch/pytorch/issues/72713
+    data_dp = FileOpener(cache_dp, mode="b")
     return data_dp.parse_csv().map(fn=lambda t: (int(t[0]), " ".join(t[1:])))
diff --git a/torchtext/datasets/cc100.py b/torchtext/datasets/cc100.py
index 8949f30b0c..5d67dc3237 100644
--- a/torchtext/datasets/cc100.py
+++ b/torchtext/datasets/cc100.py
@@ -50,5 +50,6 @@ def CC100(root: str, language_code: str = "en"):
     cache_decompressed_dp = FileOpener(cache_decompressed_dp, mode="b").read_from_xz()
     cache_decompressed_dp = cache_decompressed_dp.end_caching(mode="wb")
 
-    data_dp = FileOpener(cache_decompressed_dp, mode="r").readlines(return_path=False)
+    # TODO: read in text mode with utf-8 encoding, see: https://github.com/pytorch/pytorch/issues/72713
+    data_dp = FileOpener(cache_decompressed_dp, mode="b").readlines(return_path=False, decode=True)
     return data_dp.map(lambda x: (language_code, x))
diff --git a/torchtext/datasets/imdb.py b/torchtext/datasets/imdb.py
index 028d9a82f7..27dc9890de 100644
--- a/torchtext/datasets/imdb.py
+++ b/torchtext/datasets/imdb.py
@@ -72,11 +72,14 @@ def filter_imdb_data(key, fname):
     cache_decompressed_dp = (
         cache_decompressed_dp.lines_to_paragraphs()
     )  # group by label in cache file
+    cache_decompressed_dp = cache_decompressed_dp.map(lambda x: (x[0], x[1].encode()))
     cache_decompressed_dp = cache_decompressed_dp.end_caching(
-        mode="wt",
+        mode="wb",
         filepath_fn=lambda x: os.path.join(root, decompressed_folder, split, x),
+        skip_read=True,
     )
 
-    data_dp = FileOpener(cache_decompressed_dp, mode="t")
+    # TODO: read in text mode with utf-8 encoding, see: https://github.com/pytorch/pytorch/issues/72713
+    data_dp = FileOpener(cache_decompressed_dp, mode="b")
     # get label from cache file, eg. "aclImdb_v1/train/neg" -> "neg"
-    return data_dp.readlines().map(lambda t: (Path(t[0]).parts[-1], t[1]))
+    return data_dp.readlines(decode=True).map(lambda t: (Path(t[0]).parts[-1], t[1]))
diff --git a/torchtext/datasets/iwslt2016.py b/torchtext/datasets/iwslt2016.py
index 7123d68449..f421b9ea6e 100644
--- a/torchtext/datasets/iwslt2016.py
+++ b/torchtext/datasets/iwslt2016.py
@@ -327,10 +327,11 @@ def IWSLT2016(
         cache_decompressed_dp, full_tgt_filepath, uncleaned_tgt_filename
     )
 
-    tgt_data_dp = FileOpener(cache_inner_tgt_decompressed_dp, mode="r")
-    src_data_dp = FileOpener(cache_inner_src_decompressed_dp, mode="r")
+    # TODO: read in text mode with utf-8 encoding, see: https://github.com/pytorch/pytorch/issues/72713
+    tgt_data_dp = FileOpener(cache_inner_tgt_decompressed_dp, mode="b")
+    src_data_dp = FileOpener(cache_inner_src_decompressed_dp, mode="b")
 
-    src_lines = src_data_dp.readlines(return_path=False, strip_newline=False)
-    tgt_lines = tgt_data_dp.readlines(return_path=False, strip_newline=False)
+    src_lines = src_data_dp.readlines(return_path=False, strip_newline=False, decode=True)
+    tgt_lines = tgt_data_dp.readlines(return_path=False, strip_newline=False, decode=True)
 
     return src_lines.zip(tgt_lines)
diff --git a/torchtext/datasets/iwslt2017.py b/torchtext/datasets/iwslt2017.py
index e08fbf55bb..db3b33b926 100644
--- a/torchtext/datasets/iwslt2017.py
+++ b/torchtext/datasets/iwslt2017.py
@@ -265,10 +265,11 @@ def IWSLT2017(
         cache_decompressed_dp, full_tgt_filepath, uncleaned_tgt_filename
     )
 
-    tgt_data_dp = FileOpener(cache_inner_tgt_decompressed_dp, mode="r")
-    src_data_dp = FileOpener(cache_inner_src_decompressed_dp, mode="r")
+    # TODO: read in text mode with utf-8 encoding, see: https://github.com/pytorch/pytorch/issues/72713
mode="b") + src_data_dp = FileOpener(cache_inner_src_decompressed_dp, mode="b") - src_lines = src_data_dp.readlines(return_path=False, strip_newline=False) - tgt_lines = tgt_data_dp.readlines(return_path=False, strip_newline=False) + src_lines = src_data_dp.readlines(return_path=False, strip_newline=False, decode=True) + tgt_lines = tgt_data_dp.readlines(return_path=False, strip_newline=False, decode=True) return src_lines.zip(tgt_lines) diff --git a/torchtext/datasets/penntreebank.py b/torchtext/datasets/penntreebank.py index 6727d99ce7..09ca6884d2 100644 --- a/torchtext/datasets/penntreebank.py +++ b/torchtext/datasets/penntreebank.py @@ -49,6 +49,8 @@ def PennTreebank(root, split: Union[Tuple[str], str]): hash_type="md5", ) cache_dp = HttpReader(cache_dp).end_caching(mode="wb", same_filepath_fn=True) - data_dp = FileOpener(cache_dp, mode="r") + + # TODO: read in text mode with utf-8 encoding, see: https://github.com/pytorch/pytorch/issues/72713 + data_dp = FileOpener(cache_dp, mode="b") # remove single leading and trailing space from the dataset - return data_dp.readlines(return_path=False).map(lambda t: t.strip()) + return data_dp.readlines(return_path=False, decode=True).map(lambda t: t.strip())