Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 7049f1e

Browse files
committed
Add appropriate encoding for cross-platform Unicode writing.
1 parent 6936cd1 commit 7049f1e

18 files changed

+18
-18
lines changed

test/datasets/test_agnews.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def _get_mock_dataset(root_dir):
2020
mocked_data = defaultdict(list)
2121
for file_name in ("train.csv", "test.csv"):
2222
txt_file = os.path.join(base_dir, file_name)
23-
with open(txt_file, "w") as f:
23+
with open(txt_file, "w", encoding="utf-8") as f:
2424
for i in range(5):
2525
label = seed % 4 + 1
2626
rand_string = get_random_unicode(seed)

test/datasets/test_amazonreviewfull.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def _get_mock_dataset(root_dir):
2222
mocked_data = defaultdict(list)
2323
for file_name in ("train.csv", "test.csv"):
2424
txt_file = os.path.join(temp_dataset_dir, file_name)
25-
with open(txt_file, "w") as f:
25+
with open(txt_file, "w", encoding="utf-8") as f:
2626
for i in range(5):
2727
label = seed % 5 + 1
2828
rand_string = get_random_unicode(seed)

test/datasets/test_amazonreviewpolarity.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def _get_mock_dataset(root_dir):
2222
mocked_data = defaultdict(list)
2323
for file_name in ("train.csv", "test.csv"):
2424
txt_file = os.path.join(temp_dataset_dir, file_name)
25-
with open(txt_file, "w") as f:
25+
with open(txt_file, "w", encoding="utf-8") as f:
2626
for i in range(5):
2727
label = seed % 2 + 1
2828
rand_string = get_random_unicode(seed)

test/datasets/test_cc100.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def _get_mock_dataset(root_dir):
2525
for language_code in VALID_CODES:
2626
file_name = f"{language_code}.txt.xz"
2727
compressed_file = os.path.join(base_dir, file_name)
28-
with lzma.open(compressed_file, "wt") as f:
28+
with lzma.open(compressed_file, "wt", encoding="utf-8") as f:
2929
for i in range(5):
3030
rand_string = get_random_unicode(seed)
3131
content = f"{rand_string}\n"

test/datasets/test_conll2000chunking.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def _get_mock_dataset(root_dir):
2323
for file_name in ("train.txt", "test.txt"):
2424
txt_file = os.path.join(temp_dataset_dir, file_name)
2525
mocked_lines = mocked_data[os.path.splitext(file_name)[0]]
26-
with open(txt_file, "w") as f:
26+
with open(txt_file, "w", encoding="utf-8") as f:
2727
for i in range(5):
2828
rand_strings = [get_random_unicode(seed)]
2929
rand_label_1 = [get_random_unicode(seed)]

test/datasets/test_dbpedia.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def _get_mock_dataset(root_dir):
2323
for file_name in ("train.csv", "test.csv"):
2424
csv_file = os.path.join(temp_dataset_dir, file_name)
2525
mocked_lines = mocked_data[os.path.splitext(file_name)[0]]
26-
with open(csv_file, "w") as f:
26+
with open(csv_file, "w", encoding="utf-8") as f:
2727
for i in range(5):
2828
label = seed % 14 + 1
2929
rand_string = get_random_unicode(seed)

test/datasets/test_enwik9.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def _get_mock_dataset(root_dir):
2020
file_name = "enwik9"
2121
txt_file = os.path.join(temp_dataset_dir, file_name)
2222
mocked_data = []
23-
with open(txt_file, "w") as f:
23+
with open(txt_file, "w", encoding="utf-8") as f:
2424
for i in range(5):
2525
rand_string = "<" + get_random_unicode(seed) + ">"
2626
dataset_line = f"'{rand_string}'"

test/datasets/test_imdb.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def _get_mock_dataset(root_dir):
3232
label = "neg" if i < 2 else "pos"
3333
cur_dir = pos_dir if label == "pos" else neg_dir
3434
txt_file = os.path.join(cur_dir, f"{i}{i}_{i}.txt")
35-
with open(txt_file, "w") as f:
35+
with open(txt_file, "w", encoding="utf-8") as f:
3636
rand_string = get_random_unicode(seed)
3737
dataset_line = (label, rand_string)
3838
# append line to correct dataset split

test/datasets/test_multi30k.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def _get_mock_dataset(root_dir):
2222
mocked_data = defaultdict(list)
2323
for file_name in ("train.de", "train.en", "val.de", "val.en", "test.de", "test.en"):
2424
txt_file = os.path.join(temp_dataset_dir, file_name)
25-
with open(txt_file, "w") as f:
25+
with open(txt_file, "w", encoding="utf-8") as f:
2626
for i in range(5):
2727
rand_string = get_random_unicode(seed)
2828
f.write(rand_string + "\n")

test/datasets/test_penntreebank.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def _get_mock_dataset(root_dir):
2020
mocked_data = defaultdict(list)
2121
for file_name in ("ptb.train.txt", "ptb.valid.txt", "ptb.test.txt"):
2222
txt_file = os.path.join(base_dir, file_name)
23-
with open(txt_file, "w") as f:
23+
with open(txt_file, "w", encoding="utf-8") as f:
2424
for i in range(5):
2525
rand_string = get_random_unicode(seed)
2626
dataset_line = f"{rand_string}"

0 commit comments

Comments (0)