Add tfio.IOTensor.from_parquet support

yongtang · yongtang · commit 200cd33ebe26 · 2019-08-25T18:20:16.000Z
Note: this PR depends on PR 438.

Parquet is a columnar file format that naturally fits
table/column data. Since a Parquet file is itself indexable,
degenerating it into an iterable dataset is not desirable,
as that loses convenience and flexibility.

This PR adds tfio.IOTensor.from_parquet support so that it is possible
to access parquet data through natural `__getitem__` operations.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
diff --git a/tensorflow_io/core/python/ops/io_tensor.py b/tensorflow_io/core/python/ops/io_tensor.py
@@ -23,6 +23,7 @@
 from tensorflow_io.core.python.ops import json_io_tensor_ops
 from tensorflow_io.core.python.ops import kafka_io_tensor_ops
 from tensorflow_io.core.python.ops import prometheus_io_tensor_ops
+from tensorflow_io.core.python.ops import parquet_io_tensor_ops
 
 class IOTensor(io_tensor_ops._IOTensor):  # pylint: disable=protected-access
   """IOTensor
@@ -287,3 +288,20 @@ def from_prometheus(cls,
     with tf.name_scope(kwargs.get("name", "IOFromPrometheus")):
       return prometheus_io_tensor_ops.PrometheusIOTensor(
           query, endpoint=kwargs.get("endpoint", None), internal=True)
+
+  @classmethod
+  def from_parquet(cls,
+                   filename,
+                   **kwargs):
+    """Creates an `IOTensor` from a parquet file.
+
+    Args:
+      filename: A string, the filename of a parquet file.
+      name: Optional keyword; name scope to use (default `IOFromParquet`).
+
+    Returns:
+      An `IOTensor`, specifically a `ParquetIOTensor` for `filename`.
+
+    """
+    with tf.name_scope(kwargs.get("name", "IOFromParquet")):
+      return parquet_io_tensor_ops.ParquetIOTensor(filename, internal=True)
diff --git a/tensorflow_io/core/python/ops/parquet_io_tensor_ops.py b/tensorflow_io/core/python/ops/parquet_io_tensor_ops.py
@@ -0,0 +1,51 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ParquetIOTensor"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import uuid
+
+import tensorflow as tf
+from tensorflow_io.core.python.ops import io_tensor_ops
+from tensorflow_io.core.python.ops import core_ops
+
+class ParquetIOTensor(io_tensor_ops._TableIOTensor): # pylint: disable=protected-access
+  """Table IOTensor exposing the columns of a parquet file."""
+
+  #=============================================================================
+  # Constructor (private)
+  #=============================================================================
+  def __init__(self,
+               filename,
+               internal=False):
+    with tf.name_scope("ParquetIOTensor") as scope:
+      resource, shapes, dtypes, columns = core_ops.parquet_indexable_init(
+          filename,
+          container=scope,
+          shared_name="%s/%s" % (filename, uuid.uuid4().hex))  # uuid suffix: unique per instance
+      shapes = [
+          tf.TensorShape(
+              [None if dim < 0 else dim for dim in e.numpy() if dim != 0]  # dim < 0 -> None; 0 entries dropped
+          ) for e in tf.unstack(shapes)]
+      dtypes = [tf.as_dtype(e.numpy()) for e in tf.unstack(dtypes)]  # one tf.DType per column
+      columns = [e.numpy().decode() for e in tf.unstack(columns)]  # bytes -> str column names
+      spec = tuple([tf.TensorSpec(shape, dtype, column) for (
+          shape, dtype, column) in zip(shapes, dtypes, columns)])  # one TensorSpec per column, named after it
+      super(ParquetIOTensor, self).__init__(
+          spec, columns,
+          resource, core_ops.parquet_indexable_get_item,
+          internal=internal)
diff --git a/tensorflow_io/parquet/kernels/parquet_kernels.cc b/tensorflow_io/parquet/kernels/parquet_kernels.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow_io/arrow/kernels/arrow_kernels.h"
+#include "tensorflow_io/core/kernels/io_interface.h"
 #include "parquet/api/reader.h"
 
 namespace tensorflow {
@@ -218,5 +219,173 @@ REGISTER_KERNEL_BUILDER(Name("ReadParquet").Device(DEVICE_CPU),
 
 
 }  // namespace
+
+
+// Indexable view of one parquet file; each parquet column is one component.
+class ParquetIndexable : public IOIndexableInterface {
+ public:
+  ParquetIndexable(Env* env)
+  : env_(env) {}
+
+  ~ParquetIndexable() {}
+  // Opens the single file named by `input` and caches a (shape, dtype, name) entry per column.
+  Status Init(const std::vector<string>& input, const std::vector<string>& metadata, const void* memory_data, const int64 memory_size) override {
+    if (input.size() != 1) {  // also rejects an empty list, for which input[0] would be out of bounds
+      return errors::InvalidArgument("exactly 1 filename is expected, got ", input.size());
+    }
+    const string& filename = input[0];
+    file_.reset(new SizedRandomAccessFile(env_, filename, memory_data, memory_size));
+    TF_RETURN_IF_ERROR(file_->GetFileSize(&file_size_));
+
+    parquet_file_.reset(new ArrowRandomAccessFile(file_.get(), file_size_));
+    parquet_reader_ = parquet::ParquetFileReader::Open(parquet_file_);  // NOTE(review): presumably throws on a malformed file -- confirm how callers handle it
+    parquet_metadata_ = parquet_reader_->metadata();
+
+    shapes_.clear();
+    dtypes_.clear();
+    columns_.clear();
+    for (int i = 0; i < parquet_metadata_->num_columns(); i++) {
+      ::tensorflow::DataType dtype;
+      switch(parquet_metadata_->schema()->Column(i)->physical_type()) {
+      case parquet::Type::BOOLEAN:
+        dtype = ::tensorflow::DT_BOOL;
+        break;
+      case parquet::Type::INT32:
+        dtype = ::tensorflow::DT_INT32;
+        break;
+      case parquet::Type::INT64:
+        dtype = ::tensorflow::DT_INT64;
+        break;
+      case parquet::Type::INT96: // Deprecated; reported as int64 here, but GetItem returns InvalidArgument for it
+        dtype = ::tensorflow::DT_INT64;
+        break;
+      case parquet::Type::FLOAT:
+        dtype = ::tensorflow::DT_FLOAT;
+        break;
+      case parquet::Type::DOUBLE:
+        dtype = ::tensorflow::DT_DOUBLE;
+        break;
+      case parquet::Type::BYTE_ARRAY:  // reported as string; GetItem has no reader for it yet
+        dtype = ::tensorflow::DT_STRING;
+        break;
+      case parquet::Type::FIXED_LEN_BYTE_ARRAY:  // reported as string; GetItem has no reader for it yet
+        dtype = ::tensorflow::DT_STRING;
+        break;
+      default:
+        return errors::InvalidArgument("parquet data type is not supported: ", parquet_metadata_->schema()->Column(i)->physical_type());
+      }
+      shapes_.push_back(TensorShape({static_cast<int64>(parquet_metadata_->num_rows())}));  // every column is 1-D with num_rows entries
+      dtypes_.push_back(dtype);
+      columns_.push_back(parquet_metadata_->schema()->Column(i)->path().get()->ToDotString());
+    }
+
+    return Status::OK();
+  }
+  Status Spec(std::vector<PartialTensorShape>& shapes, std::vector<DataType>& dtypes) override {  // copies the cached per-column shapes and dtypes out
+    shapes.clear();
+    for (size_t i = 0; i < shapes_.size(); i++) {
+      shapes.push_back(shapes_[i]);
+    }
+    dtypes.clear();
+    for (size_t i = 0; i < dtypes_.size(); i++) {
+      dtypes.push_back(dtypes_[i]);
+    }
+    return Status::OK();
+  }
+
+  Status Extra(std::vector<Tensor>* extra) override {
+    // Expose the column names as a 1-D string tensor appended to `extra`
+    Tensor columns(DT_STRING, TensorShape({static_cast<int64>(columns_.size())}));
+    for (size_t i = 0; i < columns_.size(); i++) {
+      columns.flat<string>()(i) = columns_[i];
+    }
+    extra->push_back(columns);
+    return Status::OK();
+  }
+  // Fills `tensor` with rows [start, stop) of column `component`; only step == 1 is supported.
+  Status GetItem(const int64 start, const int64 stop, const int64 step, const int64 component, Tensor* tensor) override {
+    if (step != 1) {
+      return errors::InvalidArgument("step ", step, " is not supported");
+    }
+    int64 row_group_offset = 0;
+    for (int row_group = 0; row_group < parquet_metadata_->num_row_groups(); row_group++) {
+      std::shared_ptr<parquet::RowGroupReader> row_group_reader = parquet_reader_->RowGroup(row_group);
+      // Skip row groups that do not overlap [start, stop); a group ending exactly at start has nothing to read
+      if ((row_group_offset + row_group_reader->metadata()->num_rows() <= start) || (stop <= row_group_offset)) {
+        row_group_offset += row_group_reader->metadata()->num_rows();
+        continue;
+      }
+      // Clamp the requested range to this row group: rows [row_to_read_start, row_to_read_final)
+      int64 row_to_read_start = row_group_offset > start ? row_group_offset : start;
+      int64 row_to_read_final = (row_group_offset + row_group_reader->metadata()->num_rows()) < (stop) ? (row_group_offset + row_group_reader->metadata()->num_rows()) : (stop);
+      int64 row_to_read_count = row_to_read_final - row_to_read_start;
+
+      // TODO: parquet is RowGroup based so ideally the RowGroup should be cached
+      // with the hope of indexing and slicing happens on each row. For now no caching
+      // is done yet.
+      std::shared_ptr<parquet::ColumnReader> column_reader = row_group_reader->Column(component);
+
+      // PARQUET_PROCESS_TYPE reads into tensor data at [row_to_read_start - start] and fails if fewer values than rows come back; assumes the caller sized `tensor` for stop - start rows (TODO confirm)
+
+      #define PARQUET_PROCESS_TYPE(ptype, type) { \
+          parquet::TypedColumnReader<ptype>* reader = \
+              static_cast<parquet::TypedColumnReader<ptype>*>( \
+                  column_reader.get()); \
+          if (row_to_read_start > row_group_offset) { \
+            reader->Skip(row_to_read_start - row_group_offset); \
+          } \
+          ptype::c_type* value = (ptype::c_type *)(void *)(&(tensor->flat<type>().data()[row_to_read_start - start])); \
+          int64_t values_read; \
+          int64_t levels_read = reader->ReadBatch(row_to_read_count, nullptr, nullptr, value, &values_read); \
+          if (!(levels_read == values_read && levels_read == row_to_read_count)) { \
+            return errors::InvalidArgument("null value in column: ", columns_[component]); \
+          } \
+        }
+      switch (parquet_metadata_->schema()->Column(component)->physical_type()) {
+      case parquet::Type::BOOLEAN:
+        PARQUET_PROCESS_TYPE(parquet::BooleanType, bool);
+        break;
+      case parquet::Type::INT32:
+        PARQUET_PROCESS_TYPE(parquet::Int32Type, int32);
+        break;
+      case parquet::Type::INT64:
+        PARQUET_PROCESS_TYPE(parquet::Int64Type, int64);
+        break;
+      case parquet::Type::FLOAT:
+        PARQUET_PROCESS_TYPE(parquet::FloatType, float);
+        break;
+      case parquet::Type::DOUBLE:
+        PARQUET_PROCESS_TYPE(parquet::DoubleType, double);
+        break;
+      default:  // INT96, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY end up here
+        return errors::InvalidArgument("invalid data type: ", parquet_metadata_->schema()->Column(component)->physical_type());
+      }
+      row_group_offset += row_group_reader->metadata()->num_rows();
+    }
+    return Status::OK();
+  }
+
+  string DebugString() const override {
+    mutex_lock l(mu_);
+    return strings::StrCat("ParquetIndexable");
+  }
+ private:
+  mutable mutex mu_;
+  Env* env_ GUARDED_BY(mu_);
+  std::unique_ptr<SizedRandomAccessFile> file_ GUARDED_BY(mu_);
+  uint64 file_size_ GUARDED_BY(mu_);
+  std::shared_ptr<ArrowRandomAccessFile> parquet_file_;
+  std::unique_ptr<::parquet::ParquetFileReader> parquet_reader_;
+  std::shared_ptr<::parquet::FileMetaData> parquet_metadata_;
+
+  std::vector<DataType> dtypes_;  // per-column TensorFlow dtype, filled by Init
+  std::vector<TensorShape> shapes_;  // per-column shape ({num_rows}), filled by Init
+  std::vector<string> columns_;  // per-column dotted schema path, filled by Init
+};
+
+REGISTER_KERNEL_BUILDER(Name("ParquetIndexableInit").Device(DEVICE_CPU),  // CPU kernel for the ParquetIndexableInit op
+                        IOInterfaceInitOp<ParquetIndexable>);
+REGISTER_KERNEL_BUILDER(Name("ParquetIndexableGetItem").Device(DEVICE_CPU),  // CPU kernel for the ParquetIndexableGetItem op
+                        IOIndexableGetItemOp<ParquetIndexable>);
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow_io/parquet/ops/parquet_ops.cc b/tensorflow_io/parquet/ops/parquet_ops.cc
@@ -19,6 +19,41 @@ limitations under the License.
 
 namespace tensorflow {
 
+REGISTER_OP("ParquetIndexableInit")
+  .Input("input: string")
+  .Output("output: resource")
+  .Output("shapes: int64")
+  .Output("dtypes: int64")
+  .Output("columns: string")
+  .Attr("container: string = ''")
+  .Attr("shared_name: string = ''")
+  .SetIsStateful()
+  .SetShapeFn([](shape_inference::InferenceContext* c) {
+    c->set_output(0, c->Scalar());  // output: resource handle
+    c->set_output(1, c->MakeShape({c->UnknownDim(), c->UnknownDim()}));  // shapes: rank 2, one row of dims per column
+    c->set_output(2, c->MakeShape({c->UnknownDim()}));  // dtypes: rank 1, one entry per column
+    c->set_output(3, c->MakeShape({c->UnknownDim()}));  // columns: rank 1, one name per column
+    return Status::OK();
+   });
+
+REGISTER_OP("ParquetIndexableGetItem")
+  .Input("input: resource")
+  .Input("start: int64")
+  .Input("stop: int64")
+  .Input("step: int64")
+  .Input("component: int64")
+  .Output("output: dtype")
+  .Attr("shape: shape")
+  .Attr("dtype: type")
+  .SetShapeFn([](shape_inference::InferenceContext* c) {
+    PartialTensorShape shape;
+    TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));  // static output shape comes from the `shape` attr
+    shape_inference::ShapeHandle entry;
+    TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &entry));
+    c->set_output(0, entry);  // the single output gets exactly that shape
+    return Status::OK();
+   });
+
 REGISTER_OP("ListParquetColumns")
     .Input("filename: string")
     .Input("memory: string")
diff --git a/tensorflow_io/parquet/python/ops/parquet_ops.py b/tensorflow_io/parquet/python/ops/parquet_ops.py
@@ -17,10 +17,18 @@
 from __future__ import division
 from __future__ import print_function
 
+import warnings
+
 import tensorflow as tf
 from tensorflow_io.core.python.ops import core_ops as parquet_ops
 from tensorflow_io.core.python.ops import data_ops
 
+warnings.warn(
+    "The tensorflow_io.parquet.ParquetDataset is "
+    "deprecated. Please look for tfio.IOTensor.from_parquet "
+    "for reading parquet files into tensorflow.",
+    DeprecationWarning)  # module-level call: issued when this module is imported
+
 def list_parquet_columns(filename, **kwargs):
   """list_parquet_columns"""
   if not tf.executing_eagerly():
diff --git a/tests/test_parquet_eager.py b/tests/test_parquet_eager.py
@@ -24,7 +24,7 @@
 import tensorflow as tf
 if not (hasattr(tf, "version") and tf.version.VERSION.startswith("2.")):
   tf.compat.v1.enable_eager_execution()
-import tensorflow_io.parquet as parquet_io # pylint: disable=wrong-import-position
+import tensorflow_io as tfio # pylint: disable=wrong-import-position
 
 # Note: The sample file is generated from:
 # `parquet-cpp/examples/low-level-api/reader_writer`
@@ -47,18 +47,27 @@ def test_parquet():
       "parquet_cpp_example.parquet")
   filename = "file://" + filename
 
-  specs = parquet_io.list_parquet_columns(filename)
+  parquet = tfio.IOTensor.from_parquet(filename)
   columns = [
       'boolean_field',
       'int32_field',
       'int64_field',
+      'int96_field',
       'float_field',
-      'double_field']
-  p0 = parquet_io.read_parquet(filename, specs['boolean_field'])
-  p1 = parquet_io.read_parquet(filename, specs['int32_field'])
-  p2 = parquet_io.read_parquet(filename, specs['int64_field'])
-  p4 = parquet_io.read_parquet(filename, specs['float_field'])
-  p5 = parquet_io.read_parquet(filename, specs['double_field'])
+      'double_field',
+      'ba_field',
+      'flba_field']
+  assert parquet.columns == columns
+  p0 = parquet('boolean_field')
+  p1 = parquet('int32_field')
+  p2 = parquet('int64_field')
+  p4 = parquet('float_field')
+  p5 = parquet('double_field')
+  assert p0.dtype == tf.bool
+  assert p1.dtype == tf.int32
+  assert p2.dtype == tf.int64
+  assert p4.dtype == tf.float32
+  assert p5.dtype == tf.float64
 
   for i in range(500): # 500 rows.
     v0 = ((i % 2) == 0)
@@ -72,24 +81,5 @@ def test_parquet():
     assert np.isclose(v4, p4[i].numpy())
     assert np.isclose(v5, p5[i].numpy())
 
-  dataset = tf.compat.v2.data.Dataset.zip(
-      tuple(
-          [parquet_io.ParquetDataset(filename, column) for column in columns])
-  ).apply(tf.data.experimental.unbatch())
-  i = 0
-  for p in dataset:
-    v0 = ((i % 2) == 0)
-    v1 = i
-    v2 = i * 1000 * 1000 * 1000 * 1000
-    v4 = 1.1 * i
-    v5 = 1.1111111 * i
-    p0, p1, p2, p4, p5 = p
-    assert v0 == p0.numpy()
-    assert v1 == p1.numpy()
-    assert v2 == p2.numpy()
-    assert np.isclose(v4, p4.numpy())
-    assert np.isclose(v5, p5.numpy())
-    i += 1
-
 if __name__ == "__main__":
   test.main()