
Commit c60704a

Update base for Update on "create T5MultiheadAttention module"
# Description

Add the T5 architecture to torchtext.

# Process

The T5 architecture is very similar to that of a traditional transformer. The main differences are that, rather than using positional embeddings, T5 computes a relative attention bias that encodes the relative position of a token within a sequence. This position bias is then passed into each layer and used to compute the attention scores. T5 also uses a simplified layer normalization (root-mean-square normalization), which occurs at the start of every attention and feed-forward block.

Incorporating the relative attention bias requires under-the-hood changes to the MultiheadAttention module. We can use HF's implementation for computing the relative attention bias and modify the source code of torch.nn.MultiheadAttention to incorporate it. We can also create our own layer normalization, similar to HF's. Given the above components, we can then define our own T5Layer, T5Stack, and T5Model.

* T5Layer can be used as either an encoder layer or a decoder layer based on an input boolean parameter. The only difference between the two is that the decoder layer also performs cross-attention with the encoder output.
* T5Stack can also be used as either an encoder or a decoder based on an input boolean parameter, which dictates which type of layer composes the stack.
* T5Model can be used as either an encoder-only or an encoder-decoder model based on an input boolean parameter. If it is an encoder-decoder model, a causal mask is generated for the decoder input tokens.

# Testing

Not yet implemented.

# Stack

WIP PR where implementation details were discussed: #1812

[ghstack-poisoned]
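Not part of this commit's diff: to make the description above concrete, here is a minimal PyTorch sketch of the two pieces it calls out, the root-mean-square layer normalization and the relative attention bias, following the HF-style formulation mentioned above. All names (`RMSLayerNorm`, `relative_position_bucket`, `compute_position_bias`) and default values are illustrative, not the modules added by the stacked PRs.

```python
import math

import torch
import torch.nn as nn


class RMSLayerNorm(nn.Module):
    """T5-style layer norm: rescale by the reciprocal root-mean-square, no mean centering, no bias."""

    def __init__(self, d_model: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(d_model))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        variance = x.pow(2).mean(dim=-1, keepdim=True)
        return self.weight * x * torch.rsqrt(variance + self.eps)


def relative_position_bucket(
    relative_position: torch.Tensor, bidirectional: bool = True, num_buckets: int = 32, max_distance: int = 128
) -> torch.Tensor:
    """Map signed key-minus-query offsets to a small set of buckets: exact buckets for
    short distances, logarithmically coarser buckets for long ones."""
    relative_buckets = torch.zeros_like(relative_position)
    if bidirectional:
        num_buckets //= 2
        relative_buckets += (relative_position > 0).long() * num_buckets
        relative_position = relative_position.abs()
    else:
        relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact
    if_large = max_exact + (
        torch.log(relative_position.float() / max_exact)
        / math.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    ).long()
    if_large = torch.min(if_large, torch.full_like(if_large, num_buckets - 1))
    return relative_buckets + torch.where(is_small, relative_position, if_large)


def compute_position_bias(bias_embedding: nn.Embedding, query_length: int, key_length: int) -> torch.Tensor:
    """Embed the bucketed relative positions into a (1, num_heads, q_len, k_len) bias.

    `bias_embedding` is assumed to have `num_buckets` rows and `num_heads` columns."""
    context_position = torch.arange(query_length)[:, None]
    memory_position = torch.arange(key_length)[None, :]
    buckets = relative_position_bucket(memory_position - context_position)
    values = bias_embedding(buckets)  # (q_len, k_len, num_heads)
    return values.permute(2, 0, 1).unsqueeze(0)
```

In each T5Layer, this bias would be added to the raw attention scores before the softmax, and the same tensor can be passed down the stack so it only needs to be computed once.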
2 parents f7c8046 + e964051 commit c60704a

File tree

11 files changed: +69 −41 lines


.circleci/config.yml

Lines changed: 1 addition & 0 deletions
(Generated file; diff not rendered.)

.circleci/config.yml.in

Lines changed: 1 addition & 0 deletions
```diff
@@ -57,6 +57,7 @@ binary_common: &binary_common
     BUILD_VERSION: << parameters.build_version >>
     PYTORCH_VERSION: << parameters.pytorch_version >>
     CU_VERSION: cpu
+    MACOSX_DEPLOYMENT_TARGET: 10.9

 smoke_test_common: &smoke_test_common
   <<: *binary_common
```

.circleci/unittest/linux/scripts/environment.yml

Lines changed: 0 additions & 2 deletions
```diff
@@ -13,8 +13,6 @@ dependencies:
   - pytest-pythonpath
   - sacremoses
   - spacy
-  - sphinx
-  - sphinx-rtd-theme
   - tqdm
   - expecttest
   - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm==3.0.0
```

.circleci/unittest/windows/scripts/environment.yml

Lines changed: 0 additions & 3 deletions
```diff
@@ -14,11 +14,8 @@ dependencies:
   - pytest-pythonpath
   - sacremoses
   - spacy
-  - sphinx
-  - sphinx-rtd-theme
   - tqdm
   - certifi
-  - future
   - expecttest
   - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm==3.0.0
   - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm==3.0.0
```

benchmark/benchmark_torcharrow_ops.py

Lines changed: 10 additions & 1 deletion
```diff
@@ -1,6 +1,7 @@
 import sys, os

 import torcharrow as ta
+import torcharrow.pytorch as tap
 import torchtext.transforms as T
 from benchmark.utils import Timer
 from torcharrow import functional as ta_F
@@ -26,6 +27,7 @@ def run_torchtext_ops():
     add_eos_str = T.AddToken(token="<eros>", begin=False)
     add_bos_int = T.AddToken(token=0, begin=True)
     add_eos_int = T.AddToken(token=-1, begin=False)
+    convert_to_tensor = T.ToTensor(padding_value=1)

     # dataset
     train_dp = SST2(split="train")
@@ -45,6 +47,9 @@ def run_torchtext_ops():
     add_bos_int(token_ids)
     add_eos_int(token_ids)

+    with Timer("Running torchtext's to tensor conversion"):
+        convert_to_tensor(token_ids)
+

 def run_torcharrow_ops():
     # tokenizer converting text into tokens
@@ -56,7 +61,8 @@ def run_torcharrow_ops():
     # dataset
     train_dp = SST2(split="train")
     text_list = list(train_dp.map(lambda x: x[0]))
-    data_frame = ta.dataframe({"text": text_list})
+    with Timer("Converting python data to TA data frame"):
+        data_frame = ta.dataframe({"text": text_list})

     with Timer("Running torcharrow's GPT2BPE tokenizer"):
         data_frame["tokens"] = ta_F.bpe_tokenize(tokenizer, data_frame["text"])
@@ -72,6 +78,9 @@ def run_torcharrow_ops():
     ta_F.add_tokens(data_frame["token_ids"], [0], begin=True)
     ta_F.add_tokens(data_frame["token_ids"], [-1], begin=False)

+    with Timer("Running torcharrow's to tensor conversion"):
+        data_frame.to_tensor({"token_ids": tap.PadSequence(padding_value=1)})
+

 if __name__ == "__main__":
     run_torchtext_ops()
```
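For reference (not part of the diff), the `T.ToTensor` transform being timed pads ragged token-id lists with `padding_value` and stacks them into a single tensor; a small illustrative snippet with made-up token ids:

```python
import torchtext.transforms as T

# same padding value as in the benchmark above
convert_to_tensor = T.ToTensor(padding_value=1)

# the shorter list is padded up to the length of the longest one
batch = convert_to_tensor([[9906, 11, 4435], [9906, 11]])
print(batch.shape)  # expected: torch.Size([2, 3]), with the padding value 1 in the last slot
```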

packaging/torchtext/meta.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -29,6 +29,7 @@ build:
   string: py{{py}}
   script_env:
     - BUILD_VERSION
+    - MACOSX_DEPLOYMENT_TARGET

 test:
   imports:
```

requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -12,7 +12,6 @@ git+https://github.com/jekbradbury/revtok.git
1212

1313
# Documentation
1414
Sphinx
15-
sphinx_rtd_theme
1615

1716
# Required for tests only:
1817

test/datasets/test_cnndm.py

Lines changed: 9 additions & 9 deletions
```diff
@@ -41,7 +41,7 @@ def _get_mock_dataset(root_dir):
             stories.append((txt_file, dataset_line))
             seed += 2

-        # append stories to correct dataset split, must be in legixographic order of filenames per dataset
+        # append stories to correct dataset split, must be in lexicographic order of filenames per dataset
         stories.sort(key=lambda x: x[0])
         mocked_data[split] += [t[1] for t in stories]

@@ -70,15 +70,14 @@ def tearDownClass(cls):
         cls.patcher.stop()
         super().tearDownClass()

-    def _mock_split_list(split):
+    def _mock_split_list(source, split):
         story_fnames = []
-        for source in ["cnn", "dailymail"]:
-            for i in range(5):
-                url = "_".join([source, split, str(i)])
-                h = hashlib.sha1()
-                h.update(url.encode())
-                filename = h.hexdigest() + ".story"
-                story_fnames.append(filename)
+        for i in range(5):
+            url = "_".join([source, split, str(i)])
+            h = hashlib.sha1()
+            h.update(url.encode())
+            filename = h.hexdigest() + ".story"
+            story_fnames.append(filename)

         return story_fnames

@@ -92,6 +91,7 @@ def test_cnndm(self, split):
         self.assertEqual(sample, expected_sample)

     @parameterized.expand(["train", "val", "test"])
+    @patch("torchtext.datasets.cnndm._get_split_list", _mock_split_list)
     def test_cnndm_split_argument(self, split):
         dataset1 = CNNDM(root=self.root_dir, split=split)
         (dataset2,) = CNNDM(root=self.root_dir, split=(split,))
```

torchtext/datasets/cnndm.py

Lines changed: 41 additions & 22 deletions
```diff
@@ -1,5 +1,6 @@
 import hashlib
 import os
+from collections import defaultdict
 from functools import partial
 from typing import Union, Tuple

@@ -20,9 +21,12 @@
 DATASET_NAME = "CNNDM"

 URL_LIST = {
-    "train": "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txt",
-    "val": "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt",
-    "test": "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt",
+    "cnn_train": "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/cnn_wayback_training_urls.txt",
+    "cnn_val": "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/cnn_wayback_validation_urls.txt",
+    "cnn_test": "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/cnn_wayback_test_urls.txt",
+    "dailymail_train": "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/dailymail_wayback_training_urls.txt",
+    "dailymail_val": "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/dailymail_wayback_validation_urls.txt",
+    "dailymail_test": "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/dailymail_wayback_test_urls.txt",
 }

 STORIES_LIST = {
@@ -39,24 +43,34 @@

 _EXTRACTED_FOLDERS = {
     "cnn": os.path.join("cnn", "stories"),
-    "daily_mail": os.path.join("dailymail", "stories"),
+    "dailymail": os.path.join("dailymail", "stories"),
 }

+story_fnames = defaultdict(set)
+

 def _filepath_fn(root: str, source: str, _=None):
     return os.path.join(root, PATH_LIST[source])


-# this function will be used to cache the contents of the tar file
-def _extracted_filepath_fn(root: str, source: str):
-    return os.path.join(root, _EXTRACTED_FOLDERS[source])
+# called once per tar file, therefore no duplicate processing
+def _extracted_folder_fn(root: str, source: str, split: str, _=None):
+    global story_fnames
+    key = source + "_" + split
+    story_fnames[key] = set(_get_split_list(source, split))
+    filepaths = [os.path.join(root, _EXTRACTED_FOLDERS[source], story) for story in story_fnames[key]]
+    return filepaths
+

+def _extracted_filepath_fn(root: str, source: str, x: str):
+    return os.path.join(root, _EXTRACTED_FOLDERS[source], os.path.basename(x))

-def _filter_fn(story_fnames, x):
-    return os.path.basename(x[0]) in story_fnames

+def _filter_fn(source: str, split: str, x: tuple):
+    return os.path.basename(x[0]) in story_fnames[source + "_" + split]

-def _hash_urls(s):
+
+def _hash_urls(s: tuple):
     """
     Returns story filename as a heximal formated SHA1 hash of the input url string.
     Code is inspired from https://github.com/abisee/cnn-dailymail/blob/master/make_datafiles.py
@@ -69,23 +83,32 @@ def _hash_urls(s):
     return story_fname


-def _get_split_list(split: str):
-    url_dp = IterableWrapper([URL_LIST[split]])
+def _get_split_list(source: str, split: str):
+    url_dp = IterableWrapper([URL_LIST[source + "_" + split]])
     online_dp = OnlineReader(url_dp)
     return online_dp.readlines().map(fn=_hash_urls)


-def _load_stories(root: str, source: str):
+def _load_stories(root: str, source: str, split: str):
     story_dp = IterableWrapper([STORIES_LIST[source]])
     cache_compressed_dp = story_dp.on_disk_cache(
         filepath_fn=partial(_filepath_fn, root, source),
         hash_dict={_filepath_fn(root, source): STORIES_MD5[source]},
         hash_type="md5",
     )
     cache_compressed_dp = GDriveReader(cache_compressed_dp).end_caching(mode="wb", same_filepath_fn=True)
-    # TODO: cache the contents of the extracted tar file
-    cache_decompressed_dp = FileOpener(cache_compressed_dp, mode="b").load_from_tar()
-    return cache_decompressed_dp
+
+    cache_decompressed_dp = cache_compressed_dp.on_disk_cache(
+        filepath_fn=partial(_extracted_folder_fn, root, source, split)
+    )
+    cache_decompressed_dp = (
+        FileOpener(cache_decompressed_dp, mode="b").load_from_tar().filter(partial(_filter_fn, source, split))
+    )
+    cache_decompressed_dp = cache_decompressed_dp.end_caching(
+        mode="wb", filepath_fn=partial(_extracted_filepath_fn, root, source)
+    )
+    data_dp = FileOpener(cache_decompressed_dp, mode="b")
+    return data_dp


 @_create_dataset_directory(dataset_name=DATASET_NAME)
@@ -119,11 +142,7 @@ def CNNDM(root: str, split: Union[Tuple[str], str]):
             "Package `torchdata` not found. Please install following instructions at https://github.com/pytorch/data"
         )

-    cnn_dp = _load_stories(root, "cnn")
-    dailymail_dp = _load_stories(root, "dailymail")
+    cnn_dp = _load_stories(root, "cnn", split)
+    dailymail_dp = _load_stories(root, "dailymail", split)
     data_dp = cnn_dp.concat(dailymail_dp)
-    # TODO: store the .story filenames corresponding to each split on disk so we can pass that into the filepath_fn
-    # of the on_disk_cache_dp which caches the files extracted from the tar
-    story_fnames = set(_get_split_list(split))
-    data_dp = data_dp.filter(partial(_filter_fn, story_fnames))
     return data_dp.parse_cnndm_data().shuffle().set_shuffle(False).sharding_filter()
```
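A hypothetical usage sketch (not part of the diff): with the changes above, the first pass over a split triggers the download, the per-source tar extraction filtered to that split's stories, and the on-disk caching of the extracted files.

```python
from torchtext.datasets import CNNDM

# root is wherever the archives and extracted stories should be cached;
# repeated runs reuse the cached files instead of re-extracting the tars
val_dp = CNNDM(root=".data", split="val")

for sample in val_dp:
    print(sample)  # one parsed story record from parse_cnndm_data
    break
```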

torchtext/transforms.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -572,7 +572,9 @@ def __init__(
         self, vocab_path: str, do_lower_case: bool = True, strip_accents: Optional[bool] = None, return_tokens=False
     ) -> None:
         super().__init__()
-        self.bert_model = BERTEncoderPyBind(get_asset_local_path(vocab_path), do_lower_case, strip_accents)
+        self.bert_model = BERTEncoderPyBind(
+            get_asset_local_path(vocab_path, overwite=True), do_lower_case, strip_accents
+        )
         self._return_tokens = return_tokens
         self._vocab_path = vocab_path
         self._do_lower_case = do_lower_case
```
