From 9552f9a23dc07af155c323213fc4fc8821b4426b Mon Sep 17 00:00:00 2001
From: mengniwa
Date: Thu, 24 Nov 2022 10:27:21 +0800
Subject: [PATCH] Add dataloader docstrings

Signed-off-by: mengniwa
---
 .../experimental/data/__init__.py             |  3 +-
 .../experimental/data/dataloaders/__init__.py |  2 +
 .../data/dataloaders/base_dataloader.py       | 47 +++++++++++--
 .../data/dataloaders/dataloader.py            |  2 +
 .../data/dataloaders/default_dataloader.py    | 30 +++++++--
 .../experimental/data/dataloaders/fetcher.py  | 50 ++++++++++++++
 .../data/dataloaders/onnxrt_dataloader.py     |  6 ++
 .../experimental/data/dataloaders/sampler.py  | 66 +++++++++++--------
 8 files changed, 169 insertions(+), 37 deletions(-)

diff --git a/neural_compressor/experimental/data/__init__.py b/neural_compressor/experimental/data/__init__.py
index 3582b6f4c71..c5aad2c00eb 100644
--- a/neural_compressor/experimental/data/__init__.py
+++ b/neural_compressor/experimental/data/__init__.py
@@ -14,7 +14,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+# ==============================================================================
+"""Built-in dataloaders, datasets, transforms and filters for multiple framework backends."""
 
 from .datasets import DATASETS, Dataset, IterableDataset, dataset_registry
 from .transforms import TRANSFORMS, BaseTransform, transform_registry
diff --git a/neural_compressor/experimental/data/dataloaders/__init__.py b/neural_compressor/experimental/data/dataloaders/__init__.py
index e5a14f2077e..867409b93c0 100644
--- a/neural_compressor/experimental/data/dataloaders/__init__.py
+++ b/neural_compressor/experimental/data/dataloaders/__init__.py
@@ -14,6 +14,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# ==============================================================================
+"""Built-in dataloaders for multiple framework backends."""
 
 from .dataloader import DATALOADERS
diff --git a/neural_compressor/experimental/data/dataloaders/base_dataloader.py b/neural_compressor/experimental/data/dataloaders/base_dataloader.py
index 118c06cf120..5e58add4f15 100644
--- a/neural_compressor/experimental/data/dataloaders/base_dataloader.py
+++ b/neural_compressor/experimental/data/dataloaders/base_dataloader.py
@@ -14,21 +14,40 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# ==============================================================================
+"""BaseDataLoader of all dataloaders."""
 
 from abc import abstractmethod
 
-class BaseDataLoader(object):
-    """Base class for all DataLoaders. _generate_dataloader is needed to create a dataloader object
-       from the general params like batch_size and sampler. The dynamic batching is just to
-       generate a new dataloader by setting batch_size and last_batch.
-
-    """
+class BaseDataLoader:
+    """Base class for all DataLoaders.
+
+    _generate_dataloader is needed to create a dataloader object
+    from the general params like batch_size and sampler. The dynamic batching is just to
+    generate a new dataloader by setting batch_size and last_batch.
+    """
+
     def __init__(self, dataset, batch_size=1, last_batch='rollover', collate_fn=None,
                  sampler=None, batch_sampler=None, num_workers=0, pin_memory=False,
                  shuffle=False, distributed=False):
+        """Initialize BaseDataLoader.
+
+        Args:
+            dataset (object): dataset from which to load the data
+            batch_size (int, optional): number of samples per batch. Defaults to 1.
+            last_batch (str, optional): how to handle the last batch if it is incomplete.
+                Supports ['rollover', 'discard']: 'rollover' keeps it (drop_last=False),
+                'discard' drops it (drop_last=True). Defaults to 'rollover'.
+            collate_fn (callable, optional): merge data with outer dimension batch size. Defaults to None.
+            sampler (Sampler, optional): Sampler object to sample data. Defaults to None.
+            batch_sampler (BatchSampler, optional): BatchSampler object to generate batch of indices. Defaults to None.
+            num_workers (int, optional): number of subprocesses to use for data loading. Defaults to 0.
+            pin_memory (bool, optional): whether to copy data into pinned memory before returning. Defaults to False.
+            shuffle (bool, optional): whether to shuffle data. Defaults to False.
+            distributed (bool, optional): whether the dataloader is distributed. Defaults to False.
+        """
         self.dataset = dataset
         self.collate_fn = collate_fn
         self.sampler = sampler
@@ -54,6 +73,14 @@ def __init__(self, dataset, batch_size=1, last_batch='rollover', collate_fn=None
             distributed=distributed)
 
     def batch(self, batch_size, last_batch=None):
+        """Set batch size for dataloader.
+
+        Args:
+            batch_size (int): number of samples per batch.
+            last_batch (str, optional): how to handle the last batch if it is incomplete.
+                Supports ['rollover', 'discard']: 'rollover' keeps it (drop_last=False),
+                'discard' drops it (drop_last=True). Defaults to None.
+        """
         self._batch_size = batch_size
         if last_batch is not None:
             self.last_batch = last_batch
@@ -71,9 +98,19 @@ def batch(self, batch_size, last_batch=None):
 
     @property
     def batch_size(self):
+        """Get dataloader's batch_size.
+
+        Returns:
+            int: batch_size
+        """
         return self._batch_size
 
     def __iter__(self):
+        """Yield data in iterative order.
+
+        Returns:
+            iterator: iterator for dataloader
+        """
         return iter(self.dataloader)
 
     @abstractmethod
diff --git a/neural_compressor/experimental/data/dataloaders/dataloader.py b/neural_compressor/experimental/data/dataloaders/dataloader.py
index 69c6fa29ae2..c3463b875eb 100644
--- a/neural_compressor/experimental/data/dataloaders/dataloader.py
+++ b/neural_compressor/experimental/data/dataloaders/dataloader.py
@@ -14,6 +14,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# ==============================================================================
+"""Built-in dataloaders for multiple framework backends."""
 
 from .tensorflow_dataloader import TensorflowDataLoader
 from .mxnet_dataloader import MXNetDataLoader
diff --git a/neural_compressor/experimental/data/dataloaders/default_dataloader.py b/neural_compressor/experimental/data/dataloaders/default_dataloader.py
index bc1c8db15f6..85a18e5c81c 100644
--- a/neural_compressor/experimental/data/dataloaders/default_dataloader.py
+++ b/neural_compressor/experimental/data/dataloaders/default_dataloader.py
@@ -14,6 +14,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# ==============================================================================
+"""Default dataloader for multiple framework backends."""
 
 import collections
 import numpy as np
@@ -24,7 +26,7 @@ from .base_dataloader import BaseDataLoader
 
 def default_collate(batch):
-    """Puts each data field into a pd frame with outer dimension batch size"""
+    """Merge data with outer dimension batch size."""
     elem = batch[0]
     if isinstance(elem, collections.abc.Mapping):
         return {key: default_collate([d[key] for d in batch]) for key in elem}
@@ -40,13 +42,27 @@ def default_collate(batch):
     return batch
 
 class DefaultDataLoader(BaseDataLoader):
-    """DefaultDataLoader
-
-    """
-
+    """DefaultDataLoader for multiple framework backends."""
+
     def __init__(self, dataset, batch_size=1, last_batch='rollover', collate_fn=None,
                  sampler=None, batch_sampler=None, num_workers=0, pin_memory=False,
                  shuffle=False, distributed=False):
+        """Initialize DefaultDataLoader.
+
+        Args:
+            dataset (object): dataset from which to load the data
+            batch_size (int, optional): number of samples per batch. Defaults to 1.
+            last_batch (str, optional): how to handle the last batch if it is incomplete.
+                Supports ['rollover', 'discard']: 'rollover' keeps it (drop_last=False),
+                'discard' drops it (drop_last=True). Defaults to 'rollover'.
+            collate_fn (callable, optional): merge data with outer dimension batch size. Defaults to None.
+            sampler (Sampler, optional): Sampler object to sample data. Defaults to None.
+            batch_sampler (BatchSampler, optional): BatchSampler object to generate batch of indices. Defaults to None.
+            num_workers (int, optional): number of subprocesses to use for data loading. Defaults to 0.
+            pin_memory (bool, optional): whether to copy data into pinned memory before returning. Defaults to False.
+            shuffle (bool, optional): whether to shuffle data. Defaults to False.
+            distributed (bool, optional): whether the dataloader is distributed. Defaults to False.
+        """
         self.dataset = dataset
         self.last_batch = last_batch
         self.sampler = sampler
@@ -62,14 +78,17 @@ def __init__(self, dataset, batch_size=1, last_batch='rollover', collate_fn=None
             self.collate_fn = default_collate
 
     def batch(self, batch_size, last_batch='rollover'):
+        """Set batch_size and last_batch."""
         self._batch_size = batch_size
         self.last_batch = last_batch
 
     @property
     def dataloader(self):
+        """Return dataloader."""
         return self
 
     def __iter__(self):
+        """Yield data in iterative order."""
         return self._generate_dataloader(
             self.dataset,
             batch_size=self.batch_size,
@@ -83,6 +102,7 @@ def __iter__(self):
             distributed=self.distributed)
 
     def __len__(self):
+        """Get dataset length."""
         try:
             dataset_len = self.dataset.__len__()
         except (AttributeError, TypeError):
diff --git a/neural_compressor/experimental/data/dataloaders/fetcher.py b/neural_compressor/experimental/data/dataloaders/fetcher.py
index d24322d7f92..0f1d3aac70c 100644
--- a/neural_compressor/experimental/data/dataloaders/fetcher.py
+++ b/neural_compressor/experimental/data/dataloaders/fetcher.py
@@ -14,21 +14,49 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# ==============================================================================
+"""Definitions of the methods to fetch data from an iterable-style or list-style dataset."""
 
 from abc import abstractmethod
 
 class Fetcher(object):
+    """Base class for different fetchers."""
+
     def __init__(self, dataset, collate_fn, drop_last):
+        """Initialize Fetcher.
+
+        Args:
+            dataset (object): dataset object from which to get data
+            collate_fn (callable): merge data with outer dimension batch size
+            drop_last (bool): whether to drop the last batch if it is incomplete
+        """
         self.dataset = dataset
         self.collate_fn = collate_fn
         self.drop_last = drop_last
 
     @abstractmethod
     def __call__(self, batched_indices):
+        """Fetch data.
+
+        Args:
+            batched_indices (list): indices of the samples to fetch
+
+        """
         raise NotImplementedError
 
 class IterableFetcher(Fetcher):
+    """Iterate to get the next batch-size samples as a batch."""
+
     def __init__(self, dataset, collate_fn, drop_last, distributed):
+        """Initialize IterableFetcher.
+
+        Args:
+            dataset (object): dataset object from which to get data
+            collate_fn (callable): merge data with outer dimension batch size
+            drop_last (bool): whether to drop the last batch if it is incomplete
+            distributed (bool): whether the dataloader is distributed
+
+        """
         super(IterableFetcher, self).__init__(dataset, collate_fn, drop_last)
         self.dataset_iter = iter(dataset)
         self.index_whole = 0
@@ -47,6 +75,12 @@ def __init__(self, dataset, collate_fn, drop_last, distributed):
                     " please set 'distributed: True' and launch multiple processes.")
 
     def __call__(self, batched_indices):
+        """Fetch data.
+
+        Args:
+            batched_indices (list): indices of the samples to fetch
+
+        """
         batch_data = []
         batch_size = len(batched_indices)
         while True:
@@ -64,10 +98,26 @@ def __call__(self, batched_indices):
         return self.collate_fn(batch_data)
 
 class IndexFetcher(Fetcher):
+    """Take a single index or a batch of indices and fetch the samples as a batch."""
+
     def __init__(self, dataset, collate_fn, drop_last, distributed):
+        """Initialize IndexFetcher.
+
+        Args:
+            dataset (object): dataset object from which to get data
+            collate_fn (callable): merge data with outer dimension batch size
+            drop_last (bool): whether to drop the last batch if it is incomplete
+            distributed (bool): whether the dataloader is distributed
+        """
         super(IndexFetcher, self).__init__(dataset, collate_fn, drop_last)
 
     def __call__(self, batched_indices):
+        """Fetch data.
+
+        Args:
+            batched_indices (list): indices of the samples to fetch
+
+        """
         data = [self.dataset[idx] for idx in batched_indices]
         return self.collate_fn(data)
diff --git a/neural_compressor/experimental/data/dataloaders/onnxrt_dataloader.py b/neural_compressor/experimental/data/dataloaders/onnxrt_dataloader.py
index d47e8596e7c..fd567a001f4 100644
--- a/neural_compressor/experimental/data/dataloaders/onnxrt_dataloader.py
+++ b/neural_compressor/experimental/data/dataloaders/onnxrt_dataloader.py
@@ -14,6 +14,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# ==============================================================================
+"""Built-in dataloaders for onnxruntime framework backends."""
 
 from neural_compressor.utils.utility import LazyImport
 from .base_dataloader import BaseDataLoader
@@ -23,6 +25,8 @@
 torch = LazyImport('torch')
 
 class ONNXRTBertDataLoader(DefaultDataLoader):
+    """Built-in dataloader for ONNX BERT models and their variants."""
+
     def _generate_dataloader(self, dataset, batch_size, last_batch, collate_fn,
                              sampler, batch_sampler, num_workers, pin_memory,
                              shuffle, distributed):
@@ -59,6 +63,8 @@ def _generate_dataloader(self, dataset, batch_size, last_batch, collate_fn,
             return
 
 class ONNXRTDataLoader(BaseDataLoader):
+    """Built-in dataloader for onnxruntime framework backends."""
+
     def _generate_dataloader(self, dataset, batch_size, last_batch, collate_fn,
                              sampler, batch_sampler, num_workers, pin_memory,
                              shuffle, distributed):
diff --git a/neural_compressor/experimental/data/dataloaders/sampler.py b/neural_compressor/experimental/data/dataloaders/sampler.py
index 793c8a4db9c..cbe9220bc7c 100644
--- a/neural_compressor/experimental/data/dataloaders/sampler.py
+++ b/neural_compressor/experimental/data/dataloaders/sampler.py
@@ -14,57 +14,70 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# ==============================================================================
+"""Definitions of the methods to sample data."""
 
 from abc import abstractmethod
 
 class Sampler(object):
-    """Base class for all Samplers. __iter__ is needed no matter whether you use IterableSampler
-       or Squential sampler, if you want implement your own sampler, make clear what the type is
-       your Dataset, if IterableDataset(method __iter__ implemented), try to use IterableSampler,
-       else if you have an IndexDataset(method __getitem__ implemented), your dataset should have
-       method __len__ implemented.
-
+    """Base class for all Samplers.
+
+    __iter__ is needed no matter whether you use IterableSampler
+    or SequentialSampler. If you want to implement your own sampler, first make clear the type of
+    your dataset: for an IterableDataset (method __iter__ implemented), try to use IterableSampler;
+    for an index dataset (method __getitem__ implemented), your dataset should also have
+    method __len__ implemented.
     """
 
     def __init__(self, data_source):
+        """Initialize Sampler."""
         pass
 
     @abstractmethod
    def __iter__(self):
+        """Convert sampler to an iterator."""
         raise NotImplementedError
 
 class IterableSampler(Sampler):
-    """Interally samples elements, used for datasets retrieved element by interator.
-       yield None to act as a placeholder for each iteration
-
-    Args:
-        dataset (Dataset): set to None
+    """Internally samples elements.
+
+    Used for datasets retrieved element by iterator. Yields None to act as a placeholder for each iteration.
     """
 
     def __init__(self, dataset):
+        """Initialize IterableSampler.
+
+        Args:
+            dataset (object): dataset object from which to get data
+        """
         super(IterableSampler, self).__init__(None)
         self.whole_dataset = dataset
 
     def __iter__(self):
+        """Yield data in iterative order."""
         while True:
             yield None
 
     def __len__(self):
+        """Return the length of dataset."""
         raise NotImplementedError("'__len__' for IterableDataset object has not defined")
 
 class SequentialSampler(Sampler):
-    """Sequentially samples elements, used for datasets retrieved element by index.
-
-    Args:
-        dataset (Dataset): index dataset(implement method __len__) for sampling
-    """
+    """Sequentially samples elements, used for datasets retrieved element by index."""
 
     def __init__(self, dataset, distributed):
+        """Initialize SequentialSampler.
+
+        Args:
+            dataset (object): dataset object from which to get data
+            distributed (bool): whether the dataloader is distributed
+        """
         self.whole_dataset = dataset
         self.distributed = distributed
 
     def __iter__(self):
+        """Yield data in iterative order."""
         self.process_rank = 0 # The default rank is 0, which represents the main process
         self.process_size = 1 # By default, process_size=1, only the main process is running
         if self.distributed:
@@ -81,21 +94,20 @@ def __iter__(self):
         return iter(range(self.process_rank, len(self.whole_dataset), self.process_size))
 
     def __len__(self):
+        """Return the length of dataset."""
         return len(self.whole_dataset)
 
 class BatchSampler(Sampler):
-    """yield a mini-batch of indices for SquentialSampler and batch size length of None list for
-       IterableSampler.
-
-    Args:
-        sampler (Sampler): sampler used for generating batches.
-        batch_size (int): Size of mini-batch.
-        drop_last (bool): BatchSampler will drop the last batch if drop_last is True, else
-            will return the last batch whose size will be less than batch_size
-
-    """
+    """Yield a batch of indices; __len__ gives the number of batches."""
 
     def __init__(self, sampler, batch_size, drop_last=True):
+        """Initialize BatchSampler.
+
+        Args:
+            sampler (Sampler): sampler used for generating batches
+            batch_size (int): size of batch
+            drop_last (bool, optional): whether to drop the last batch if it is incomplete. Defaults to True.
+        """
         if isinstance(drop_last, bool):
             self.drop_last = drop_last
         else:
@@ -106,6 +118,7 @@ def __init__(self, sampler, batch_size, drop_last=True):
             self.drop_last = drop_last
 
     def __iter__(self):
+        """Yield data in iterative order."""
         batch = []
         for idx in self.sampler:
             batch.append(idx)
@@ -116,6 +129,7 @@ def __iter__(self):
         yield batch
 
     def __len__(self):
+        """Return the number of batches."""
         if self.drop_last:
             return len(self.sampler) // self.batch_size
         else:
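
Reviewer note, not part of the patch: a minimal usage sketch of the API documented above, assuming the patch is applied to an installed neural_compressor. ToyDataset is a hypothetical index-style dataset used only for illustration; everything else matches names from this diff.

    # Hypothetical example exercising DefaultDataLoader from this patch.
    from neural_compressor.experimental.data.dataloaders.default_dataloader import DefaultDataLoader

    class ToyDataset:
        """Index-style dataset: implements __getitem__ and __len__ (illustrative only)."""
        def __init__(self, n=10):
            self.samples = list(range(n))

        def __getitem__(self, index):
            return self.samples[index]

        def __len__(self):
            return len(self.samples)

    # last_batch='rollover' keeps the final incomplete batch (drop_last=False);
    # 'discard' would drop it instead.
    dataloader = DefaultDataLoader(ToyDataset(), batch_size=4, last_batch='rollover')
    for batch in dataloader:
        print(batch)  # three batches: 4, 4, and finally 2 samples

Because ToyDataset implements __getitem__, the dataloader should route through SequentialSampler, BatchSampler, and IndexFetcher; an __iter__-only dataset would instead go through IterableSampler and IterableFetcher.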