Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions tensorflow_io/core/python/ops/hdf5_io_tensor_ops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""HDF5IOTensor"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import uuid

import tensorflow as tf
from tensorflow_io.core.python.ops import io_tensor_ops
from tensorflow_io.core.python.ops import core_ops

class HDF5IOTensor(io_tensor_ops._CollectionIOTensor): # pylint: disable=protected-access
  """HDF5IOTensor

  A collection IO tensor backed by an HDF5 file: one component per
  dataset discovered in the file, each addressable by its dataset name.
  """

  #=============================================================================
  # Constructor (private)
  #=============================================================================
  def __init__(self,
               filename,
               internal=False):
    with tf.name_scope("HDF5IOTensor") as scope:
      # Open the file and obtain both the resource handle and the list
      # of dataset (column) names; the uuid suffix keeps the shared
      # resource name unique per instance.
      resource, columns = core_ops.hdf5_indexable_init(
          filename,
          container=scope,
          shared_name="%s/%s" % (filename, uuid.uuid4().hex))
      # The op returns column names as bytes; decode them to str.
      keys = [key.decode() for key in columns.numpy().tolist()]
      # Build one TensorSpec per dataset from its reported shape/dtype.
      entries = []
      for key in keys:
        shape, dtype = core_ops.hdf5_indexable_spec(resource, key)
        entries.append(
            tf.TensorSpec(
                tf.TensorShape(shape), tf.as_dtype(dtype.numpy()), key))
      super(HDF5IOTensor, self).__init__(
          tuple(entries), keys,
          resource, core_ops.hdf5_indexable_get_item,
          internal=internal)
18 changes: 18 additions & 0 deletions tensorflow_io/core/python/ops/io_tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from tensorflow_io.core.python.ops import io_tensor_ops
from tensorflow_io.core.python.ops import audio_io_tensor_ops
from tensorflow_io.core.python.ops import json_io_tensor_ops
from tensorflow_io.core.python.ops import hdf5_io_tensor_ops
from tensorflow_io.core.python.ops import kafka_io_tensor_ops
from tensorflow_io.core.python.ops import lmdb_io_tensor_ops
from tensorflow_io.core.python.ops import prometheus_io_tensor_ops
Expand Down Expand Up @@ -346,3 +347,20 @@ def from_lmdb(cls,
"""
with tf.name_scope(kwargs.get("name", "IOFromLMDB")):
return lmdb_io_tensor_ops.LMDBIOTensor(filename, internal=True)

@classmethod
def from_hdf5(cls,
              filename,
              **kwargs):
  """Creates an `IOTensor` from an hdf5 file.

  Each dataset in the hdf5 file becomes one component of the
  returned collection tensor.

  Args:
    filename: A string, the filename of an hdf5 file.
    name: A name prefix for the IOTensor (optional).

  Returns:
    An `IOTensor`.

  """
  with tf.name_scope(kwargs.get("name", "IOFromHDF5")):
    return hdf5_io_tensor_ops.HDF5IOTensor(filename, internal=True)
42 changes: 42 additions & 0 deletions tensorflow_io/core/python/ops/io_tensor_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,48 @@ def __call__(self, column):
spec, self._resource, self._function,
component=column, internal=True)

class _CollectionIOTensor(_IOTensor):
  """_CollectionIOTensor

  `CollectionIOTensor` is different from `TableIOTensor` in that each
  component could have different shapes. While additional table-wide
  operations are planned to be supported for `TableIOTensor` so that
  the same operations could be applied to every column, there is no plan
  to support the same in `CollectionIOTensor`. In other words,
  `CollectionIOTensor` is only a dictionary with values consisting
  of `BaseIOTensor`.
  """

  def __init__(self,
               spec,
               keys,
               resource,
               function,
               internal=False):
    self._keys = keys
    self._resource = resource
    self._function = function
    super(_CollectionIOTensor, self).__init__(
        spec, keys, internal=internal)

  #=============================================================================
  # Accessors
  #=============================================================================

  @property
  def keys(self):
    """The names of columns"""
    return self._keys

  def __call__(self, key):
    """Return a BaseIOTensor with key named `key`"""
    # A single index() lookup replaces the original redundant
    # `self.keys.index(next(e for e in self.keys if e == key))`,
    # and raises a descriptive ValueError (instead of StopIteration)
    # when `key` is not present.
    key_index = self.keys.index(key)
    spec = tf.nest.flatten(self.spec)[key_index]
    return BaseIOTensor(
        spec, self._resource, self._function,
        component=key, internal=True)

class _SeriesIOTensor(_IOTensor):
"""_SeriesIOTensor"""

Expand Down
153 changes: 153 additions & 0 deletions tensorflow_io/hdf5/kernels/hdf5_kernels.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ limitations under the License.
==============================================================================*/

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow_io/core/kernels/io_interface.h"
#include "tensorflow_io/core/kernels/stream.h"

#include <hdf5.h>
#include <hdf5_hl.h>
Expand Down Expand Up @@ -320,5 +322,156 @@ REGISTER_KERNEL_BUILDER(Name("ReadHDF5").Device(DEVICE_CPU),


} // namespace


// HDF5Indexable exposes the datasets of an HDF5 file as an indexable
// collection: Init() enumerates the datasets and caches their specs,
// Component()/Spec() report them, and GetItem() reads row ranges.
class HDF5Indexable : public IOIndexableInterface {
 public:
  HDF5Indexable(Env* env)
  : env_(env) {}

  ~HDF5Indexable() {}

  // Opens the HDF5 file (possibly from an in-memory image), enumerates
  // every dataset, and caches each dataset's shape and dtype.
  Status Init(const std::vector<string>& input, const std::vector<string>& metadata, const void* memory_data, const int64 memory_size) override {
    if (input.size() > 1) {
      return errors::InvalidArgument("more than 1 filename is not supported");
    }
    const string& filename = input[0];
    file_.reset(new SizedRandomAccessFile(env_, filename, memory_data, memory_size));
    TF_RETURN_IF_ERROR(file_->GetFileSize(&file_size_));

    file_image_.reset(new HDF5FileImage(env_, filename, ""));
    H5::H5File *file = file_image_->GetFile();
    if (file == nullptr) {
      return errors::InvalidArgument("unable to open hdf5 file: ", filename);
    }

    H5O_info_t info;
    file->getObjinfo(info);
    HDF5Iterate data(info.addr);
    // Walk the file and collect every dataset name into `data`.
    herr_t err = H5Literate(file->getId(), H5_INDEX_NAME, H5_ITER_NATIVE, NULL, HDF5Iterate::Iterate, (void *)&data);
    if (err < 0) {
      // Fix: the H5Literate result was previously ignored, silently
      // producing an empty column list on iteration failure.
      return errors::InvalidArgument("unable to iterate datasets in hdf5 file: ", filename);
    }
    for (size_t i = 0; i < data.datasets_.size(); i++) {
      columns_.emplace_back(data.datasets_[i]);
      columns_index_[data.datasets_[i]] = i;
    }

    // Cache shape and dtype for each dataset so Spec() is a table lookup.
    for (size_t i = 0; i < columns_.size(); i++) {
      ::tensorflow::DataType dtype;
      string dataset = columns_[i];
      H5::DataSet data_set = file->openDataSet(dataset);

      H5::DataSpace data_space = data_set.getSpace();
      int rank = data_space.getSimpleExtentNdims();
      absl::InlinedVector<hsize_t, 4> dims(rank);
      data_space.getSimpleExtentDims(dims.data());

      // Map the HDF5 native type to a TensorFlow dtype; unknown types
      // are rejected up front rather than at read time.
      H5::DataType data_type = data_set.getDataType();
      hid_t native_type = H5Tget_native_type(data_type.getId(), H5T_DIR_ASCEND);
      if (H5Tequal(native_type, H5T_NATIVE_INT)) {
        dtype = DT_INT32;
      } else if (H5Tequal(native_type, H5T_NATIVE_UINT32)) {
        dtype = DT_UINT32;
      } else if (H5Tequal(native_type, H5T_NATIVE_LONG)) {
        dtype = DT_INT64;
      } else if (H5Tequal(native_type, H5T_NATIVE_FLOAT)) {
        dtype = DT_FLOAT;
      } else if (H5Tequal(native_type, H5T_NATIVE_DOUBLE)) {
        dtype = DT_DOUBLE;
      } else {
        return errors::InvalidArgument("unsupported data type: ", native_type);
      }
      dtypes_.emplace_back(dtype);
      absl::InlinedVector<int64, 4> shape_dims(rank);
      for (int r = 0; r < rank; r++) {
        shape_dims[r] = dims[r];
      }
      shapes_.emplace_back(TensorShape(shape_dims));
    }
    return Status::OK();
  }

  // Returns the dataset names as a 1-D string tensor.
  Status Component(Tensor* component) override {
    *component = Tensor(DT_STRING, TensorShape({static_cast<int64>(columns_.size())}));
    for (size_t i = 0; i < columns_.size(); i++) {
      component->flat<string>()(i) = columns_[i];
    }
    return Status::OK();
  }

  // Reports the cached shape and dtype for the named dataset.
  Status Spec(const Tensor& component, PartialTensorShape* shape, DataType* dtype) override {
    const int64 column_index = columns_index_[component.scalar<string>()()];
    *shape = shapes_[column_index];
    *dtype = dtypes_[column_index];
    return Status::OK();
  }

  // Reads rows [start, stop) of the named dataset into `tensor`.
  // Only step == 1 is supported.
  Status GetItem(const int64 start, const int64 stop, const int64 step, const Tensor& component, Tensor* tensor) override {
    if (step != 1) {
      return errors::InvalidArgument("step ", step, " is not supported");
    }
    const string& column = component.scalar<string>()();

    H5::H5File *file = file_image_->GetFile();
    try {
      H5::DataSet data_set = file->openDataSet(column);
      H5::DataSpace data_space = data_set.getSpace();

      int rank = data_space.getSimpleExtentNdims();
      absl::InlinedVector<hsize_t, 4> dims(rank);
      data_space.getSimpleExtentDims(dims.data());

      // Fix: cast the unsigned extent to int64 to avoid a signed/unsigned
      // comparison, and reject stop < start, which would previously
      // underflow `stop - start` into a huge unsigned selection count.
      if (start > static_cast<int64>(dims[0]) ||
          stop > static_cast<int64>(dims[0]) || stop < start) {
        return errors::InvalidArgument("dataset ", column, " selection is out of boundary");
      }
      // Select the hyperslab [start, stop) along the first dimension,
      // keeping the full extent of the remaining dimensions.
      absl::InlinedVector<hsize_t, 4> dims_start(dims.size(), 0);
      dims_start[0] = start;
      dims[0] = stop - start;

      H5::DataSpace memory_space(dims.size(), dims.data());

      data_space.selectHyperslab(H5S_SELECT_SET, dims.data(), dims_start.data());

      H5::DataType data_type = data_set.getDataType();
      hid_t native_type = H5Tget_native_type(data_type.getId(), H5T_DIR_ASCEND);
      if (H5Tequal(native_type, H5T_NATIVE_INT)) {
        data_set.read(tensor->flat<int32>().data(), H5::PredType::NATIVE_INT, memory_space, data_space);
      } else if (H5Tequal(native_type, H5T_NATIVE_UINT32)) {
        data_set.read(tensor->flat<uint32>().data(), H5::PredType::NATIVE_UINT32, memory_space, data_space);
      } else if (H5Tequal(native_type, H5T_NATIVE_LONG)) {
        data_set.read(tensor->flat<int64>().data(), H5::PredType::NATIVE_LONG, memory_space, data_space);
      } else if (H5Tequal(native_type, H5T_NATIVE_FLOAT)) {
        data_set.read(tensor->flat<float>().data(), H5::PredType::NATIVE_FLOAT, memory_space, data_space);
      } else if (H5Tequal(native_type, H5T_NATIVE_DOUBLE)) {
        data_set.read(tensor->flat<double>().data(), H5::PredType::NATIVE_DOUBLE, memory_space, data_space);
      } else {
        return errors::Unimplemented("data type not supported yet: ", data_set.getTypeClass());
      }
    } catch (const H5::FileIException& e) {
      // Fix: catch by const reference (was by value, which copies and
      // can slice derived exception types).
      return errors::InvalidArgument("unable to open dataset: ", e.getCDetailMsg());
    }

    return Status::OK();
  }

  string DebugString() const override {
    mutex_lock l(mu_);
    return strings::StrCat("HDF5Indexable");
  }

 private:
  mutable mutex mu_;
  Env* env_ GUARDED_BY(mu_);
  std::unique_ptr<SizedRandomAccessFile> file_ GUARDED_BY(mu_);
  uint64 file_size_ GUARDED_BY(mu_);
  std::unique_ptr<HDF5FileImage> file_image_;

  std::vector<DataType> dtypes_;      // dtype per column, parallel to columns_
  std::vector<TensorShape> shapes_;   // shape per column, parallel to columns_
  std::vector<string> columns_;       // dataset names in discovery order
  std::unordered_map<string, int64> columns_index_;  // name -> index in columns_
};

// Wire HDF5Indexable into the generic IO interface kernels so the
// HDF5Indexable{Init,Spec,GetItem} ops are backed by this class.
REGISTER_KERNEL_BUILDER(Name("HDF5IndexableInit").Device(DEVICE_CPU),
                        IOInterfaceInitOp<HDF5Indexable>);
REGISTER_KERNEL_BUILDER(Name("HDF5IndexableSpec").Device(DEVICE_CPU),
                        IOInterfaceSpecOp<HDF5Indexable>);
REGISTER_KERNEL_BUILDER(Name("HDF5IndexableGetItem").Device(DEVICE_CPU),
                        IOIndexableGetItemOp<HDF5Indexable>);
} // namespace data
} // namespace tensorflow
40 changes: 40 additions & 0 deletions tensorflow_io/hdf5/ops/hdf5_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,46 @@ limitations under the License.

namespace tensorflow {

// Opens an HDF5 file and returns a resource handle plus the list of
// dataset (component) names found in the file.
REGISTER_OP("HDF5IndexableInit")
    .Input("input: string")
    .Output("output: resource")
    .Output("component: string")
    .Attr("container: string = ''")
    .Attr("shared_name: string = ''")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->Scalar());
      // Fix: the kernel's Component() fills a 1-D vector with one entry
      // per dataset, so the inferred shape is a vector of unknown
      // length, not a scalar.
      c->set_output(1, c->MakeShape({c->UnknownDim()}));
      return Status::OK();
    });

// Reports the shape (1-D int64 vector) and dtype (scalar) of one
// named component of the opened file.
REGISTER_OP("HDF5IndexableSpec")
    .Input("input: resource")
    .Input("component: string")
    .Output("shape: int64")
    .Output("dtype: int64")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->MakeShape({c->UnknownDim()}));
      c->set_output(1, c->MakeShape({}));
      return Status::OK();
    });

// Reads rows [start, stop) with stride `step` from one named component;
// the output shape/dtype are supplied as attrs by the caller.
REGISTER_OP("HDF5IndexableGetItem")
    .Input("input: resource")
    .Input("start: int64")
    .Input("stop: int64")
    .Input("step: int64")
    .Input("component: string")
    .Output("output: dtype")
    .Attr("shape: shape")
    .Attr("dtype: type")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      PartialTensorShape shape;
      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
      shape_inference::ShapeHandle entry;
      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &entry));
      c->set_output(0, entry);
      return Status::OK();
    });

REGISTER_OP("ListHDF5Datasets")
.Input("filename: string")
.Input("memory: string")
Expand Down
8 changes: 8 additions & 0 deletions tensorflow_io/hdf5/python/ops/hdf5_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,18 @@
from __future__ import division
from __future__ import print_function

import warnings

import tensorflow as tf
from tensorflow_io.core.python.ops import core_ops
from tensorflow_io.core.python.ops import data_ops

# Module-level deprecation notice: emitted once on import to steer users
# of this legacy module toward tfio.IOTensor.from_hdf5.
warnings.warn(
    "The tensorflow_io.hdf5.HDF5Dataset is "
    "deprecated. Please look for tfio.IOTensor.from_hdf5 "
    "for reading HDF5 files into tensorflow.",
    DeprecationWarning)

def list_hdf5_datasets(filename, **kwargs):
"""list_hdf5_datasets"""
if not tf.executing_eagerly():
Expand Down
Loading