Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions tensorflow_io/core/python/ops/hdf5_io_tensor_ops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""HDF5IOTensor"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import uuid

import tensorflow as tf
from tensorflow_io.core.python.ops import io_tensor_ops
from tensorflow_io.core.python.ops import core_ops

class HDF5IOTensor(io_tensor_ops._CollectionIOTensor): # pylint: disable=protected-access
  """HDF5IOTensor

  A collection IO tensor backed by an HDF5 file: one component per
  dataset discovered in the file, each addressable by its dataset name.
  """

  #=============================================================================
  # Constructor (private)
  #=============================================================================
  def __init__(self,
               filename,
               internal=False):
    with tf.name_scope("HDF5IOTensor") as scope:
      # Open the file and obtain both the resource handle and the list
      # of dataset (column) names; the uuid suffix keeps the shared
      # resource name unique per instance.
      resource, columns = core_ops.hdf5_indexable_init(
          filename,
          container=scope,
          shared_name="%s/%s" % (filename, uuid.uuid4().hex))
      # The op returns column names as bytes; decode them to str.
      keys = [key.decode() for key in columns.numpy().tolist()]
      # Build one TensorSpec per dataset from its reported shape/dtype.
      entries = []
      for key in keys:
        shape, dtype = core_ops.hdf5_indexable_spec(resource, key)
        entries.append(
            tf.TensorSpec(
                tf.TensorShape(shape), tf.as_dtype(dtype.numpy()), key))
      super(HDF5IOTensor, self).__init__(
          tuple(entries), keys,
          resource, core_ops.hdf5_indexable_get_item,
          internal=internal)
18 changes: 18 additions & 0 deletions tensorflow_io/core/python/ops/io_tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from tensorflow_io.core.python.ops import io_tensor_ops
from tensorflow_io.core.python.ops import audio_io_tensor_ops
from tensorflow_io.core.python.ops import json_io_tensor_ops
from tensorflow_io.core.python.ops import hdf5_io_tensor_ops
from tensorflow_io.core.python.ops import kafka_io_tensor_ops
from tensorflow_io.core.python.ops import lmdb_io_tensor_ops
from tensorflow_io.core.python.ops import prometheus_io_tensor_ops
Expand Down Expand Up @@ -346,3 +347,20 @@ def from_lmdb(cls,
"""
with tf.name_scope(kwargs.get("name", "IOFromLMDB")):
return lmdb_io_tensor_ops.LMDBIOTensor(filename, internal=True)

@classmethod
def from_hdf5(cls,
              filename,
              **kwargs):
  """Creates an `IOTensor` from an hdf5 file.

  Each dataset in the hdf5 file becomes one component of the
  returned collection tensor.

  Args:
    filename: A string, the filename of an hdf5 file.
    name: A name prefix for the IOTensor (optional).

  Returns:
    An `IOTensor`.

  """
  with tf.name_scope(kwargs.get("name", "IOFromHDF5")):
    return hdf5_io_tensor_ops.HDF5IOTensor(filename, internal=True)
42 changes: 42 additions & 0 deletions tensorflow_io/core/python/ops/io_tensor_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,48 @@ def __call__(self, column):
spec, self._resource, self._function,
component=column, internal=True)

class _CollectionIOTensor(_IOTensor):
  """_CollectionIOTensor

  `CollectionIOTensor` is different from `TableIOTensor` in that each
  component could have different shapes. While additional table-wide
  operations are planned to be supported for `TableIOTensor` so that
  the same operations could be applied to every column, there is no plan
  to support the same in `CollectionIOTensor`. In other words,
  `CollectionIOTensor` is only a dictionary with values consisting
  of `BaseIOTensor`.
  """

  def __init__(self,
               spec,
               keys,
               resource,
               function,
               internal=False):
    self._keys = keys
    self._resource = resource
    self._function = function
    super(_CollectionIOTensor, self).__init__(
        spec, keys, internal=internal)

  #=============================================================================
  # Accessors
  #=============================================================================

  @property
  def keys(self):
    """The names of columns"""
    return self._keys

  def __call__(self, key):
    """Return a BaseIOTensor with key named `key`"""
    # A single index() lookup replaces the original redundant
    # `self.keys.index(next(e for e in self.keys if e == key))`,
    # and raises a descriptive ValueError (instead of StopIteration)
    # when `key` is not present.
    key_index = self.keys.index(key)
    spec = tf.nest.flatten(self.spec)[key_index]
    return BaseIOTensor(
        spec, self._resource, self._function,
        component=key, internal=True)

class _SeriesIOTensor(_IOTensor):
"""_SeriesIOTensor"""

Expand Down
153 changes: 153 additions & 0 deletions tensorflow_io/hdf5/kernels/hdf5_kernels.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ limitations under the License.
==============================================================================*/

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow_io/core/kernels/io_interface.h"
#include "tensorflow_io/core/kernels/stream.h"

#include <hdf5.h>
#include <hdf5_hl.h>
Expand Down Expand Up @@ -320,5 +322,156 @@ REGISTER_KERNEL_BUILDER(Name("ReadHDF5").Device(DEVICE_CPU),


} // namespace


// HDF5Indexable exposes the datasets of an HDF5 file as an indexable
// collection: Init() enumerates the datasets and caches their specs,
// Component()/Spec() report them, and GetItem() reads row ranges.
class HDF5Indexable : public IOIndexableInterface {
 public:
  HDF5Indexable(Env* env)
  : env_(env) {}

  ~HDF5Indexable() {}

  // Opens the HDF5 file (possibly from an in-memory image), enumerates
  // every dataset, and caches each dataset's shape and dtype.
  Status Init(const std::vector<string>& input, const std::vector<string>& metadata, const void* memory_data, const int64 memory_size) override {
    if (input.size() > 1) {
      return errors::InvalidArgument("more than 1 filename is not supported");
    }
    const string& filename = input[0];
    file_.reset(new SizedRandomAccessFile(env_, filename, memory_data, memory_size));
    TF_RETURN_IF_ERROR(file_->GetFileSize(&file_size_));

    file_image_.reset(new HDF5FileImage(env_, filename, ""));
    H5::H5File *file = file_image_->GetFile();
    if (file == nullptr) {
      return errors::InvalidArgument("unable to open hdf5 file: ", filename);
    }

    H5O_info_t info;
    file->getObjinfo(info);
    HDF5Iterate data(info.addr);
    // Walk the file and collect every dataset name into `data`.
    herr_t err = H5Literate(file->getId(), H5_INDEX_NAME, H5_ITER_NATIVE, NULL, HDF5Iterate::Iterate, (void *)&data);
    if (err < 0) {
      // Fix: the H5Literate result was previously ignored, silently
      // producing an empty column list on iteration failure.
      return errors::InvalidArgument("unable to iterate datasets in hdf5 file: ", filename);
    }
    for (size_t i = 0; i < data.datasets_.size(); i++) {
      columns_.emplace_back(data.datasets_[i]);
      columns_index_[data.datasets_[i]] = i;
    }

    // Cache shape and dtype for each dataset so Spec() is a table lookup.
    for (size_t i = 0; i < columns_.size(); i++) {
      ::tensorflow::DataType dtype;
      string dataset = columns_[i];
      H5::DataSet data_set = file->openDataSet(dataset);

      H5::DataSpace data_space = data_set.getSpace();
      int rank = data_space.getSimpleExtentNdims();
      absl::InlinedVector<hsize_t, 4> dims(rank);
      data_space.getSimpleExtentDims(dims.data());

      // Map the HDF5 native type to a TensorFlow dtype; unknown types
      // are rejected up front rather than at read time.
      H5::DataType data_type = data_set.getDataType();
      hid_t native_type = H5Tget_native_type(data_type.getId(), H5T_DIR_ASCEND);
      if (H5Tequal(native_type, H5T_NATIVE_INT)) {
        dtype = DT_INT32;
      } else if (H5Tequal(native_type, H5T_NATIVE_UINT32)) {
        dtype = DT_UINT32;
      } else if (H5Tequal(native_type, H5T_NATIVE_LONG)) {
        dtype = DT_INT64;
      } else if (H5Tequal(native_type, H5T_NATIVE_FLOAT)) {
        dtype = DT_FLOAT;
      } else if (H5Tequal(native_type, H5T_NATIVE_DOUBLE)) {
        dtype = DT_DOUBLE;
      } else {
        return errors::InvalidArgument("unsupported data type: ", native_type);
      }
      dtypes_.emplace_back(dtype);
      absl::InlinedVector<int64, 4> shape_dims(rank);
      for (int r = 0; r < rank; r++) {
        shape_dims[r] = dims[r];
      }
      shapes_.emplace_back(TensorShape(shape_dims));
    }
    return Status::OK();
  }

  // Returns the dataset names as a 1-D string tensor.
  Status Component(Tensor* component) override {
    *component = Tensor(DT_STRING, TensorShape({static_cast<int64>(columns_.size())}));
    for (size_t i = 0; i < columns_.size(); i++) {
      component->flat<string>()(i) = columns_[i];
    }
    return Status::OK();
  }

  // Reports the cached shape and dtype for the named dataset.
  Status Spec(const Tensor& component, PartialTensorShape* shape, DataType* dtype) override {
    const int64 column_index = columns_index_[component.scalar<string>()()];
    *shape = shapes_[column_index];
    *dtype = dtypes_[column_index];
    return Status::OK();
  }

  // Reads rows [start, stop) of the named dataset into `tensor`.
  // Only step == 1 is supported.
  Status GetItem(const int64 start, const int64 stop, const int64 step, const Tensor& component, Tensor* tensor) override {
    if (step != 1) {
      return errors::InvalidArgument("step ", step, " is not supported");
    }
    const string& column = component.scalar<string>()();

    H5::H5File *file = file_image_->GetFile();
    try {
      H5::DataSet data_set = file->openDataSet(column);
      H5::DataSpace data_space = data_set.getSpace();

      int rank = data_space.getSimpleExtentNdims();
      absl::InlinedVector<hsize_t, 4> dims(rank);
      data_space.getSimpleExtentDims(dims.data());

      // Fix: cast the unsigned extent to int64 to avoid a signed/unsigned
      // comparison, and reject stop < start, which would previously
      // underflow `stop - start` into a huge unsigned selection count.
      if (start > static_cast<int64>(dims[0]) ||
          stop > static_cast<int64>(dims[0]) || stop < start) {
        return errors::InvalidArgument("dataset ", column, " selection is out of boundary");
      }
      // Select the hyperslab [start, stop) along the first dimension,
      // keeping the full extent of the remaining dimensions.
      absl::InlinedVector<hsize_t, 4> dims_start(dims.size(), 0);
      dims_start[0] = start;
      dims[0] = stop - start;

      H5::DataSpace memory_space(dims.size(), dims.data());

      data_space.selectHyperslab(H5S_SELECT_SET, dims.data(), dims_start.data());

      H5::DataType data_type = data_set.getDataType();
      hid_t native_type = H5Tget_native_type(data_type.getId(), H5T_DIR_ASCEND);
      if (H5Tequal(native_type, H5T_NATIVE_INT)) {
        data_set.read(tensor->flat<int32>().data(), H5::PredType::NATIVE_INT, memory_space, data_space);
      } else if (H5Tequal(native_type, H5T_NATIVE_UINT32)) {
        data_set.read(tensor->flat<uint32>().data(), H5::PredType::NATIVE_UINT32, memory_space, data_space);
      } else if (H5Tequal(native_type, H5T_NATIVE_LONG)) {
        data_set.read(tensor->flat<int64>().data(), H5::PredType::NATIVE_LONG, memory_space, data_space);
      } else if (H5Tequal(native_type, H5T_NATIVE_FLOAT)) {
        data_set.read(tensor->flat<float>().data(), H5::PredType::NATIVE_FLOAT, memory_space, data_space);
      } else if (H5Tequal(native_type, H5T_NATIVE_DOUBLE)) {
        data_set.read(tensor->flat<double>().data(), H5::PredType::NATIVE_DOUBLE, memory_space, data_space);
      } else {
        return errors::Unimplemented("data type not supported yet: ", data_set.getTypeClass());
      }
    } catch (const H5::FileIException& e) {
      // Fix: catch by const reference (was by value, which copies and
      // can slice derived exception types).
      return errors::InvalidArgument("unable to open dataset: ", e.getCDetailMsg());
    }

    return Status::OK();
  }

  string DebugString() const override {
    mutex_lock l(mu_);
    return strings::StrCat("HDF5Indexable");
  }

 private:
  mutable mutex mu_;
  Env* env_ GUARDED_BY(mu_);
  std::unique_ptr<SizedRandomAccessFile> file_ GUARDED_BY(mu_);
  uint64 file_size_ GUARDED_BY(mu_);
  std::unique_ptr<HDF5FileImage> file_image_;

  std::vector<DataType> dtypes_;      // dtype per column, parallel to columns_
  std::vector<TensorShape> shapes_;   // shape per column, parallel to columns_
  std::vector<string> columns_;       // dataset names in discovery order
  std::unordered_map<string, int64> columns_index_;  // name -> index in columns_
};

// Wire HDF5Indexable into the generic IO interface kernels so the
// HDF5Indexable{Init,Spec,GetItem} ops are backed by this class.
REGISTER_KERNEL_BUILDER(Name("HDF5IndexableInit").Device(DEVICE_CPU),
                        IOInterfaceInitOp<HDF5Indexable>);
REGISTER_KERNEL_BUILDER(Name("HDF5IndexableSpec").Device(DEVICE_CPU),
                        IOInterfaceSpecOp<HDF5Indexable>);
REGISTER_KERNEL_BUILDER(Name("HDF5IndexableGetItem").Device(DEVICE_CPU),
                        IOIndexableGetItemOp<HDF5Indexable>);
} // namespace data
} // namespace tensorflow
40 changes: 40 additions & 0 deletions tensorflow_io/hdf5/ops/hdf5_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,46 @@ limitations under the License.

namespace tensorflow {

// Opens an HDF5 file and returns a resource handle plus the list of
// dataset (component) names found in the file.
REGISTER_OP("HDF5IndexableInit")
    .Input("input: string")
    .Output("output: resource")
    .Output("component: string")
    .Attr("container: string = ''")
    .Attr("shared_name: string = ''")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->Scalar());
      // Fix: the kernel's Component() fills a 1-D vector with one entry
      // per dataset, so the inferred shape is a vector of unknown
      // length, not a scalar.
      c->set_output(1, c->MakeShape({c->UnknownDim()}));
      return Status::OK();
    });

// Reports the shape (1-D int64 vector) and dtype (scalar) of one
// named component of the opened file.
REGISTER_OP("HDF5IndexableSpec")
    .Input("input: resource")
    .Input("component: string")
    .Output("shape: int64")
    .Output("dtype: int64")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->MakeShape({c->UnknownDim()}));
      c->set_output(1, c->MakeShape({}));
      return Status::OK();
    });

// Reads rows [start, stop) with stride `step` from one named component;
// the output shape/dtype are supplied as attrs by the caller.
REGISTER_OP("HDF5IndexableGetItem")
    .Input("input: resource")
    .Input("start: int64")
    .Input("stop: int64")
    .Input("step: int64")
    .Input("component: string")
    .Output("output: dtype")
    .Attr("shape: shape")
    .Attr("dtype: type")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      PartialTensorShape shape;
      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
      shape_inference::ShapeHandle entry;
      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &entry));
      c->set_output(0, entry);
      return Status::OK();
    });

REGISTER_OP("ListHDF5Datasets")
.Input("filename: string")
.Input("memory: string")
Expand Down
8 changes: 8 additions & 0 deletions tensorflow_io/hdf5/python/ops/hdf5_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,18 @@
from __future__ import division
from __future__ import print_function

import warnings

import tensorflow as tf
from tensorflow_io.core.python.ops import core_ops
from tensorflow_io.core.python.ops import data_ops

# Module-level deprecation notice: emitted once on import to steer users
# of this legacy module toward tfio.IOTensor.from_hdf5.
warnings.warn(
    "The tensorflow_io.hdf5.HDF5Dataset is "
    "deprecated. Please look for tfio.IOTensor.from_hdf5 "
    "for reading HDF5 files into tensorflow.",
    DeprecationWarning)

def list_hdf5_datasets(filename, **kwargs):
"""list_hdf5_datasets"""
if not tf.executing_eagerly():
Expand Down
Loading