diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst index 5a1cbf8167..33eb44b21d 100644 --- a/docs/source/datasets.rst +++ b/docs/source/datasets.rst @@ -32,84 +32,74 @@ AG_NEWS .. autofunction:: AG_NEWS +AmazonReviewFull +~~~~~~~~~~~~~~~~ -SogouNews -~~~~~~~~~ +.. autofunction:: AmazonReviewFull -.. autofunction:: SogouNews +AmazonReviewPolarity +~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: AmazonReviewPolarity DBpedia ~~~~~~~ .. autofunction:: DBpedia -YelpReviewPolarity -~~~~~~~~~~~~~~~~~~ +IMDb +~~~~ -.. autofunction:: YelpReviewPolarity +.. autofunction:: IMDB -YelpReviewFull -~~~~~~~~~~~~~~ +SogouNews +~~~~~~~~~ -.. autofunction:: YelpReviewFull +.. autofunction:: SogouNews + +SST2 +~~~~ + +.. autofunction:: SST2 YahooAnswers ~~~~~~~~~~~~ .. autofunction:: YahooAnswers -AmazonReviewPolarity -~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: AmazonReviewPolarity - -AmazonReviewFull -~~~~~~~~~~~~~~~~ - -.. autofunction:: AmazonReviewFull - -IMDb -~~~~ +YelpReviewFull +~~~~~~~~~~~~~~ -.. autofunction:: IMDB +.. autofunction:: YelpReviewFull -SST2 -~~~~ +YelpReviewPolarity +~~~~~~~~~~~~~~~~~~ -.. autofunction:: SST2 +.. autofunction:: YelpReviewPolarity Language Modeling ^^^^^^^^^^^^^^^^^ +PennTreebank +~~~~~~~~~~~~ + +.. autofunction:: PennTreebank + WikiText-2 ~~~~~~~~~~ .. autofunction:: WikiText2 - WikiText103 ~~~~~~~~~~~ .. autofunction:: WikiText103 -PennTreebank -~~~~~~~~~~~~ - -.. autofunction:: PennTreebank - - Machine Translation ^^^^^^^^^^^^^^^^^^^ -Multi30k -~~~~~~~~ - -.. autofunction:: Multi30k - - - IWSLT2016 ~~~~~~~~~ @@ -120,20 +110,25 @@ IWSLT2017 .. autofunction:: IWSLT2017 +Multi30k +~~~~~~~~ -Sequence Tagging -^^^^^^^^^^^^^^^^ +.. autofunction:: Multi30k -UDPOS -~~~~~ -.. autofunction:: UDPOS +Sequence Tagging +^^^^^^^^^^^^^^^^ CoNLL2000Chunking ~~~~~~~~~~~~~~~~~ .. autofunction:: CoNLL2000Chunking +UDPOS +~~~~~ + +.. 
autofunction:: UDPOS + Question Answer ^^^^^^^^^^^^^^^ @@ -153,6 +148,11 @@ SQuAD 2.0 Unsupervised Learning ^^^^^^^^^^^^^^^^^^^^^ +CC100 +~~~~~~ + +.. autofunction:: CC100 + EnWik9 ~~~~~~ diff --git a/torchtext/datasets/cc100.py b/torchtext/datasets/cc100.py index 8949f30b0c..1fb3dfe0f1 100644 --- a/torchtext/datasets/cc100.py +++ b/torchtext/datasets/cc100.py @@ -30,6 +30,17 @@ @_create_dataset_directory(dataset_name=DATASET_NAME) def CC100(root: str, language_code: str = "en"): + """CC100 Dataset + + For additional details refer to https://data.statmt.org/cc-100/ + + Args: + root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache') + language_code: the language of the dataset + + :returns: DataPipe that yields tuple of language code and text + :rtype: (str, str) + """ if language_code not in VALID_CODES: raise ValueError(f"Invalid language code {language_code}") diff --git a/torchtext/datasets/conll2000chunking.py b/torchtext/datasets/conll2000chunking.py index 3f6540aacd..917461bc1a 100644 --- a/torchtext/datasets/conll2000chunking.py +++ b/torchtext/datasets/conll2000chunking.py @@ -39,9 +39,8 @@ def CoNLL2000Chunking(root: str, split: Union[Tuple[str], str]): For additional details refer to https://www.clips.uantwerpen.be/conll2000/chunking/ Number of lines per split: - train: 8936 - - test: 2012 + - train: 8936 + - test: 2012 Args: root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache') diff --git a/torchtext/datasets/multi30k.py b/torchtext/datasets/multi30k.py index 12519b1679..8056737b4a 100644 --- a/torchtext/datasets/multi30k.py +++ b/torchtext/datasets/multi30k.py @@ -47,6 +47,11 @@ def Multi30k( For additional details refer to https://www.statmt.org/wmt16/multimodal-task.html#task1 + Number of lines per split: + - train: 29000 + - valid: 1014 + - test: 1000 + Args: root: Directory where the datasets are saved. 
Default: os.path.expanduser('~/.torchtext/cache') split: split or splits to be returned. Can be a string or tuple of strings. Default: ('train', 'valid', 'test') diff --git a/torchtext/datasets/squad1.py b/torchtext/datasets/squad1.py index 0573395493..3f9b4aa8a4 100644 --- a/torchtext/datasets/squad1.py +++ b/torchtext/datasets/squad1.py @@ -38,10 +38,8 @@ def SQuAD1(root: str, split: Union[Tuple[str], str]): For additional details refer to https://rajpurkar.github.io/SQuAD-explorer/ Number of lines per split: - train: 87599 - - Dev: 10570 - + - train: 87599 + - dev: 10570 Args: root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache') diff --git a/torchtext/datasets/squad2.py b/torchtext/datasets/squad2.py index e8549dfd23..67e7bdf411 100644 --- a/torchtext/datasets/squad2.py +++ b/torchtext/datasets/squad2.py @@ -38,9 +38,8 @@ def SQuAD2(root: str, split: Union[Tuple[str], str]): For additional details refer to https://rajpurkar.github.io/SQuAD-explorer/ Number of lines per split: - train: 130319 - - Dev: 11873 + - train: 130319 + - dev: 11873 Args: diff --git a/torchtext/datasets/sst2.py b/torchtext/datasets/sst2.py index 45cae58c9c..e2d0d48883 100644 --- a/torchtext/datasets/sst2.py +++ b/torchtext/datasets/sst2.py @@ -3,7 +3,6 @@ from torchtext._internal.module_utils import is_module_available from torchtext.data.datasets_utils import ( - _add_docstring_header, _create_dataset_directory, _wrap_split_argument, ) @@ -37,10 +36,25 @@ } -@_add_docstring_header(num_lines=NUM_LINES, num_classes=2) @_create_dataset_directory(dataset_name=DATASET_NAME) @_wrap_split_argument(("train", "dev", "test")) def SST2(root, split): + """SST2 Dataset + + For additional details refer to https://nlp.stanford.edu/sentiment/ + + Number of lines per split: + - train: 67349 + - dev: 872 + - test: 1821 + + Args: + root: Directory where the datasets are saved. 
Default: os.path.expanduser('~/.torchtext/cache') + split: split or splits to be returned. Can be a string or tuple of strings. Default: (`train`, `dev`, `test`) + + :returns: DataPipe that yields tuple of text and/or label (0 to 1). The `test` split only returns text. + :rtype: Union[(str, int), (str,)] + """ # TODO Remove this after removing conditional dependency if not is_module_available("torchdata"): raise ModuleNotFoundError( diff --git a/torchtext/datasets/udpos.py b/torchtext/datasets/udpos.py index c2aef3c530..e5a850fbf3 100644 --- a/torchtext/datasets/udpos.py +++ b/torchtext/datasets/udpos.py @@ -33,11 +33,9 @@ def UDPOS(root: str, split: Union[Tuple[str], str]): """UDPOS Dataset Number of lines per split: - train: 12543 - - valid: 2002 - - test: 2077 + - train: 12543 + - valid: 2002 + - test: 2077 Args: root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache')