3 changes: 2 additions & 1 deletion neural_compressor/experimental/data/__init__.py
@@ -14,8 +14,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# ==============================================================================
+"""Built-in dataloaders, datasets, transforms, filters for multiple framework backends."""

-"""Built-in datasets class for multiple framework backends."""

from .datasets import DATASETS, Dataset, IterableDataset, dataset_registry
from .transforms import TRANSFORMS, BaseTransform, transform_registry
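For orientation, a hedged sketch of how the re-exported Dataset base class is typically subclassed. ToyDataset and its contents are hypothetical, and this assumes Dataset only requires __getitem__ and __len__, which is how the dataloaders below consume datasets:

# Illustrative only, not part of this PR.
from neural_compressor.experimental.data import Dataset

class ToyDataset(Dataset):
    """Hypothetical map-style dataset yielding (feature, label) pairs."""

    def __init__(self, size=10):
        self.samples = [([float(i)], i % 2) for i in range(size)]

    def __getitem__(self, index):
        return self.samples[index]

    def __len__(self):
        return len(self.samples)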
2 changes: 2 additions & 0 deletions neural_compressor/experimental/data/dataloaders/__init__.py
@@ -14,6 +14,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Built-in dataloaders for multiple framework backends."""

from .dataloader import DATALOADERS

Expand Down
47 changes: 42 additions & 5 deletions neural_compressor/experimental/data/dataloaders/base_dataloader.py
@@ -14,21 +14,40 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""BaseDataloder of all dataloaders."""

from abc import abstractmethod


-class BaseDataLoader(object):
-    """Base class for all DataLoaders. _generate_dataloader is needed to create a dataloader object
-    from the general params like batch_size and sampler. The dynamic batching is just to
-    generate a new dataloader by setting batch_size and last_batch.
-
-    """
+class BaseDataLoader:
+    """Base class for all DataLoaders.
+
+    _generate_dataloader is needed to create a dataloader object
+    from the general params like batch_size and sampler. The dynamic batching is just to
+    generate a new dataloader by setting batch_size and last_batch.
+
+    """

def __init__(self, dataset, batch_size=1, last_batch='rollover', collate_fn=None,
sampler=None, batch_sampler=None, num_workers=0, pin_memory=False,
shuffle=False, distributed=False):
"""Initialize BaseDataLoader.

Args:
dataset (object): dataset from which to load the data
batch_size (int, optional): number of samples per batch. Defaults to 1.
last_batch (str, optional): how to handle the last batch if it is incomplete.
Support ['rollover', 'discard']: 'rollover' keeps it (drop_last=False),
'discard' drops it (drop_last=True). Defaults to 'rollover'.
collate_fn (callable, optional): merge data with outer dimension batch size. Defaults to None.
sampler (Sampler, optional): Sampler object to sample data. Defaults to None.
batch_sampler (BatchSampler, optional): BatchSampler object to generate batch of indices. Defaults to None.
num_workers (int, optional): number of subprocesses to use for data loading. Defaults to 0.
pin_memory (bool, optional): whether to copy data into pinned memory before returning. Defaults to False.
shuffle (bool, optional): whether to shuffle data. Defaults to False.
distributed (bool, optional): whether the dataloader is distributed. Defaults to False.
"""
self.dataset = dataset
self.collate_fn = collate_fn
self.sampler = sampler
@@ -54,6 +73,14 @@ def __init__(self, dataset, batch_size=1, last_batch='rollover', collate_fn=None
distributed=distributed)

def batch(self, batch_size, last_batch=None):
"""Set batch size for dataloader.

Args:
batch_size (int): number of samples per batch.
last_batch (str, optional): how to handle the last batch if it is incomplete.
Support ['rollover', 'discard']: 'rollover' keeps it, 'discard' drops it.
Defaults to None.
"""
self._batch_size = batch_size
if last_batch is not None:
self.last_batch = last_batch
@@ -71,9 +98,19 @@ def batch(self, batch_size, last_batch=None):

@property
def batch_size(self):
"""Get dataloader's batch_size.

Returns:
int: batch_size
"""
return self._batch_size

def __iter__(self):
"""Yield data in iterative order.

Returns:
iterator: iterator over the dataloader
"""
return iter(self.dataloader)

@abstractmethod
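To make the contract above concrete, a hedged sketch of a backend subclass. ListBackedDataLoader is hypothetical; the hook signature mirrors the _generate_dataloader overrides visible later in this diff, and it assumes __init__ and batch() regenerate self.dataloader through this hook, as the class docstring describes:

# Illustrative only, not part of this PR.
from neural_compressor.experimental.data.dataloaders.base_dataloader import BaseDataLoader

class ListBackedDataLoader(BaseDataLoader):
    def _generate_dataloader(self, dataset, batch_size, last_batch, collate_fn,
                             sampler, batch_sampler, num_workers, pin_memory,
                             shuffle, distributed):
        # Naive batching: slice the dataset (assumed to support len() and
        # slicing) into batch_size chunks; 'discard' drops the incomplete
        # tail batch, 'rollover' keeps it.
        drop_last = (last_batch == 'discard')
        batches = []
        for start in range(0, len(dataset), batch_size):
            chunk = dataset[start:start + batch_size]
            if len(chunk) < batch_size and drop_last:
                break
            batches.append(chunk)
        return batches

loader = ListBackedDataLoader(dataset=list(range(10)), batch_size=4, last_batch='discard')
print(list(loader))   # [[0, 1, 2, 3], [4, 5, 6, 7]] -- the tail [8, 9] is discarded
loader.batch(5)       # dynamic batching: a new dataloader is generated
print(list(loader))   # [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]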
2 changes: 2 additions & 0 deletions neural_compressor/experimental/data/dataloaders/dataloader.py
@@ -14,6 +14,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Built-in dataloaders for multiple framework backends."""

from .tensorflow_dataloader import TensorflowDataLoader
from .mxnet_dataloader import MXNetDataLoader
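A usage note on what this module exposes (an assumption inferred from the imports, not stated in the diff): DATALOADERS maps framework-name strings to the dataloader classes above, so a backend can be selected by key:

# Illustrative only; the exact key strings are assumptions.
from neural_compressor.experimental.data.dataloaders.dataloader import DATALOADERS

dataset = [([0.0, 1.0], 0), ([1.0, 0.0], 1)]   # any object with __getitem__/__len__
loader_cls = DATALOADERS['tensorflow']         # e.g. TensorflowDataLoader
loader = loader_cls(dataset=dataset, batch_size=2)   # requires that framework installed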
neural_compressor/experimental/data/dataloaders/default_dataloader.py
@@ -14,6 +14,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Default dataloader for multiple framework backends."""

import collections
import numpy as np
@@ -24,7 +26,7 @@
from .base_dataloader import BaseDataLoader

def default_collate(batch):
"""Puts each data field into a pd frame with outer dimension batch size"""
"""Merge data with outer dimension batch size."""
elem = batch[0]
if isinstance(elem, collections.abc.Mapping):
return {key: default_collate([d[key] for d in batch]) for key in elem}
@@ -40,13 +42,27 @@ def default_collate(batch):
return batch

class DefaultDataLoader(BaseDataLoader):
"""DefaultDataLoader

"""

"""DefaultDataLoader for multiple framework backends."""

def __init__(self, dataset, batch_size=1, last_batch='rollover', collate_fn=None,
sampler=None, batch_sampler=None, num_workers=0, pin_memory=False,
shuffle=False, distributed=False):
"""Initialize DefaultDataLoader.

Args:
dataset (object): dataset from which to load the data
batch_size (int, optional): number of samples per batch. Defaults to 1.
last_batch (str, optional): how to handle the last batch if it is incomplete.
Support ['rollover', 'discard']: 'rollover' keeps it (drop_last=False),
'discard' drops it (drop_last=True). Defaults to 'rollover'.
collate_fn (callable, optional): merge data with outer dimension batch size. Defaults to None.
sampler (Sampler, optional): Sampler object to sample data. Defaults to None.
batch_sampler (BatchSampler, optional): BatchSampler object to generate batch of indices. Defaults to None.
num_workers (int, optional): number of subprocesses to use for data loading. Defaults to 0.
pin_memory (bool, optional): whether to copy data into pinned memory before returning. Defaults to False.
shuffle (bool, optional): whether to shuffle data. Defaults to False.
distributed (bool, optional): whether the dataloader is distributed. Defaults to False.
"""
self.dataset = dataset
self.last_batch = last_batch
self.sampler = sampler
@@ -62,14 +78,17 @@ def __init__(self, dataset, batch_size=1, last_batch='rollover', collate_fn=None
self.collate_fn = default_collate

def batch(self, batch_size, last_batch='rollover'):
"""Set batch_size and last_batch."""
self._batch_size = batch_size
self.last_batch = last_batch

@property
def dataloader(self):
"""Return dataloader."""
return self

def __iter__(self):
"""Yield data in iterative order."""
return self._generate_dataloader(
self.dataset,
batch_size=self.batch_size,
@@ -83,6 +102,7 @@ def __iter__(self):
distributed=self.distributed)

def __len__(self):
"""Get dataset length."""
try:
dataset_len = self.dataset.__len__()
except (AttributeError, TypeError):
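A hedged usage sketch for DefaultDataLoader, assuming the elided _generate_dataloader drives the sampler/fetcher machinery and collates with default_collate as documented above:

# Illustrative only, not part of this PR.
import numpy as np
from neural_compressor.experimental.data.dataloaders.default_dataloader import DefaultDataLoader

samples = [(np.array([i, i + 1]), i % 2) for i in range(5)]
loader = DefaultDataLoader(dataset=samples, batch_size=2, last_batch='rollover')

for features, labels in loader:
    # default_collate stacks each field with batch size as the outer dimension:
    # features is (2, 2) for full batches and (1, 2) for the rolled-over tail.
    print(features.shape, labels)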
50 changes: 50 additions & 0 deletions neural_compressor/experimental/data/dataloaders/fetcher.py
@@ -14,21 +14,49 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Definitions of the methods to fetch data from an iterable-style or list-style dataset."""

from abc import abstractmethod

class Fetcher(object):
"""Base class for different fetchers."""

def __init__(self, dataset, collate_fn, drop_last):
"""Initialize Fetcher.

Args:
dataset (object): dataset object from which to get data
collate_fn (callable): merge data with outer dimension batch size
drop_last (bool): whether to drop the last batch if it is incomplete
"""
self.dataset = dataset
self.collate_fn = collate_fn
self.drop_last = drop_last

@abstractmethod
def __call__(self, batched_indices):
"""Fetch data.

Args:
batched_indices (list): indices of the samples to fetch as one batch

"""
raise NotImplementedError

class IterableFetcher(Fetcher):
"""Iterate to get next batch-size samples as a batch."""

def __init__(self, dataset, collate_fn, drop_last, distributed):
"""Initialize IterableFetcher.

Args:
dataset (object): dataset object from which to get data
collate_fn (callable): merge data with outer dimension batch size
drop_last (bool): whether to drop the last batch if it is incomplete
distributed (bool): whether the dataloader is distributed

"""
super(IterableFetcher, self).__init__(dataset, collate_fn, drop_last)
self.dataset_iter = iter(dataset)
self.index_whole = 0
@@ -47,6 +75,12 @@ def __init__(self, dataset, collate_fn, drop_last, distributed):
" please set 'distributed: True' and launch multiple processes.")

def __call__(self, batched_indices):
"""Fetch data.

Args:
batched_indices (list): indices of the samples to fetch as one batch

"""
batch_data = []
batch_size = len(batched_indices)
while True:
@@ -64,10 +98,26 @@ def __call__(self, batched_indices):
return self.collate_fn(batch_data)

class IndexFetcher(Fetcher):
"""Take single index or a batch of indices to fetch samples as a batch."""

def __init__(self, dataset, collate_fn, drop_last, distributed):
"""Initialize IndexFetcher.

Args:
dataset (object): dataset object from which to get data
collate_fn (callable): merge data with outer dimension batch size
drop_last (bool): whether to drop the last batch if it is incomplete
distributed (bool): whether the dataloader is distributed
"""
super(IndexFetcher, self).__init__(dataset, collate_fn, drop_last)

def __call__(self, batched_indices):
"""Fetch data.

Args:
batched_indices (list): indices of the samples to fetch as one batch

"""
data = [self.dataset[idx] for idx in batched_indices]
return self.collate_fn(data)

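To make the fetcher contract concrete, a small sketch with IndexFetcher. The constructor and __call__ signatures are taken from this diff; reusing default_collate from the default dataloader module is an assumption about where it lives:

# Illustrative only, not part of this PR.
from neural_compressor.experimental.data.dataloaders.default_dataloader import default_collate
from neural_compressor.experimental.data.dataloaders.fetcher import IndexFetcher

dataset = [10, 11, 12, 13]
fetcher = IndexFetcher(dataset, collate_fn=default_collate, drop_last=False, distributed=False)
print(fetcher([0, 2]))   # [10, 12]: dataset[0] and dataset[2] collated into one batch

IterableFetcher plays the same role for iterable-style datasets: it ignores the index values themselves and simply pulls the next len(batched_indices) samples from the dataset iterator.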
neural_compressor/experimental/data/dataloaders/onnxrt_dataloader.py
@@ -14,6 +14,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Built-in dataloaders for onnxruntime framework backends."""

from neural_compressor.utils.utility import LazyImport
from .base_dataloader import BaseDataLoader
@@ -23,6 +25,8 @@
torch = LazyImport('torch')

class ONNXRTBertDataLoader(DefaultDataLoader):
"""Built-in dataloader for onnx bert model and its varients."""

def _generate_dataloader(self, dataset, batch_size, last_batch, collate_fn,
sampler, batch_sampler, num_workers, pin_memory,
shuffle, distributed):
@@ -59,6 +63,8 @@ def _generate_dataloader(self, dataset, batch_size, last_batch, collate_fn,
return

class ONNXRTDataLoader(BaseDataLoader):
"""Built-in dataloader for onnxruntime framework backends."""

def _generate_dataloader(self, dataset, batch_size, last_batch, collate_fn,
sampler, batch_sampler, num_workers, pin_memory,
shuffle, distributed):
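Finally, a hedged construction sketch for ONNXRTDataLoader. The toy numpy dataset is hypothetical, and it assumes the non-BERT path behaves like DefaultDataLoader:

# Illustrative only, not part of this PR.
import numpy as np
from neural_compressor.experimental.data.dataloaders.onnxrt_dataloader import ONNXRTDataLoader

features = [(np.zeros((3, 224, 224), dtype=np.float32), 0) for _ in range(4)]
loader = ONNXRTDataLoader(dataset=features, batch_size=2)
for data, label in loader:
    print(data.shape, label)   # expected (2, 3, 224, 224) under the default collate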