tensorflow · terrytangyuan · Apr 17, 2019 · Apr 15, 2019 · Apr 16, 2019 · Apr 16, 2019
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,7 @@
 /.bazelrc
 /bazel-*
 /artifacts
+/get-pip.py
 .DS_Store
 *.pyc
 __pycache__

diff --git a/.travis.yml b/.travis.yml
@@ -45,24 +45,6 @@ jobs:
     script:
     - docker run -i -t --rm -v $PWD:/v -v $PWD/.cache/pip/:/root/.cache/pip -w /v --net=host buildpack-deps:14.04 bash -x -e .travis/python.release.sh "${TENSORFLOW_INSTALL}" python python3.6
     - docker run -i -t --rm -v $PWD:/v -v $PWD/.cache/pip/:/root/.cache/pip -w /v --net=host -e GITHUB_PAT=9eecea9200150af1ec29f70bb067575eb2e56fc7 buildpack-deps:18.04 bash -x -e .travis/wheel.test.sh
-  # Developer Builds make sure the source code of the repo could be
-  # build and run on commodity developer environment (Ubuntu 16.04/18.04).
-  - stage: build
-    name: "Developer Build on Ubuntu 16.04"
-    before_script: &developer_build
-    - |
-      echo "bash -x -e .travis/bazel.configure.sh \"${TENSORFLOW_INSTALL}\"" > script.sh
-      echo "bash -x -e .travis/bazel.build.sh" >> script.sh
-      echo "bash -x -e .travis/build.test.sh \"${TENSORFLOW_INSTALL}\"" >> script.sh
-    - cat script.sh
-    script:
-    - docker run -i -t --rm -v $PWD:/v -v $PWD/.cache/pip/:/root/.cache/pip -w /v --net=host buildpack-deps:16.04 bash -x -e script.sh
-  - stage: build
-    name: "Developer Build on Ubuntu 18.04"
-    before_script: *developer_build
-    script:
-    - docker run -i -t --rm -v $PWD:/v -v $PWD/.cache/pip/:/root/.cache/pip -w /v --net=host buildpack-deps:18.04 bash -x -e script.sh
-
   # Preview Release Builds are for TensorFlow 2.0 Preview release.
   # Note only Linux (Ubuntu 18.04) and macOS are supported.
   - stage: release

diff --git a/tensorflow_io/core/kernels/dataset_ops.h b/tensorflow_io/core/kernels/dataset_ops.h
@@ -21,6 +21,8 @@ limitations under the License.
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/lib/io/inputstream_interface.h"
 #include "tensorflow/core/lib/io/random_inputstream.h"
+#include "tensorflow/core/lib/io/zlib_compression_options.h"
+#include "tensorflow/core/lib/io/zlib_inputstream.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
 
 namespace tensorflow {
@@ -231,20 +233,20 @@ class DataInputOp: public OpKernel {
       OP_REQUIRES_OK(ctx, env_->NewRandomAccessFile(filename, &file));
       if (filters_.size() == 0) {
         // No filter means only a file stream.
-        io::RandomAccessInputStream s(file.get());
+        io::RandomAccessInputStream file_stream(file.get());
         T entry;
-        OP_REQUIRES_OK(ctx, entry.FromInputStream(s, filename, string(""), string("")));
+        OP_REQUIRES_OK(ctx, entry.FromInputStream(file_stream, filename, string(""), string("")));
         output.emplace_back(std::move(entry));
         continue;
       }
 
       std::unique_ptr<struct archive, void(*)(struct archive *)> archive(archive_read_new(), [](struct archive *a){ archive_read_free(a);});
       OP_REQUIRES_OK(ctx, ArchiveInputStream::SetupFilters(archive.get(), filters_));
 
-      ArchiveInputStream s(file.get(), archive.get());
+      ArchiveInputStream archive_stream(file.get(), archive.get());
 
       OP_REQUIRES(
-          ctx, (archive_read_open(archive.get(), &s, NULL, ArchiveInputStream::CallbackRead, NULL) == ARCHIVE_OK), 
+          ctx, (archive_read_open(archive.get(), &archive_stream, NULL, ArchiveInputStream::CallbackRead, NULL) == ARCHIVE_OK),
           errors::InvalidArgument("unable to open datainput for ", filename, ": ", archive_error_string(archive.get())));
 
       size_t index = output.size();
@@ -254,9 +256,26 @@ class DataInputOp: public OpKernel {
         string entryname = archive_entry_pathname(entry);
 	string filtername;
         if (ArchiveInputStream::MatchFilters(archive.get(), entryname, filters_, &filtername)) {
-          s.ResetEntryOffset();
 	  T entry;
-          OP_REQUIRES_OK(ctx, entry.FromInputStream(s, filename, entryname, filtername));
+	  if (filtername == "none") {
+            // If filter is none, then just use the initial stream.
+	    // NOTE: Looks like libarchive may not be able to handle
+	    // none with text type correctly (not reading data in none archive)
+	    // So use the shortcut here.
+            io::RandomAccessInputStream file_stream(file.get());
+            OP_REQUIRES_OK(ctx, entry.FromInputStream(file_stream, filename, entryname, filtername));
+	  } else if (filtername == "gz") {
+            // Treat gz file specially. Looks like libarchive always have issue
+            // with text file so use ZlibInputStream. Now libarchive
+            // is mostly used for archive (not compressio).
+            io::RandomAccessInputStream file_stream(file.get());
+            io::ZlibCompressionOptions zlib_compression_options = zlib_compression_options = io::ZlibCompressionOptions::GZIP();
+            io::ZlibInputStream compression_stream(&file_stream, 65536, 65536,  zlib_compression_options);
+            OP_REQUIRES_OK(ctx, entry.FromInputStream(compression_stream, filename, entryname, filtername));
+          } else {
+            archive_stream.ResetEntryOffset();
+            OP_REQUIRES_OK(ctx, entry.FromInputStream(archive_stream, filename, entryname, filtername));
+          }
           output.emplace_back(std::move(entry));
 	}
       }
@@ -374,10 +393,21 @@ class InputDatasetBase : public DatasetBase {
       current_input_state_.reset(nullptr);
 
       TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file_));
-      if (filtername.size() == 0) {
-        // No filter means only a file stream.
+      if (filtername.size() == 0 || filtername == "none") {
+	// If filter is none, then just use the initial stream.
+	// NOTE: Looks like libarchive may not be able to handle
+	// none with text type correctly (not reading data in none archive)
+	// So use the shortcut here.
         stream_.reset(new io::RandomAccessInputStream(file_.get()));
         return Status::OK();
+      } else if (filtername == "gz") {
+        // Treat gz file specially. Looks like libarchive always have issue
+	// with text file so use ZlibInputStream. Now libarchive
+	// is mostly used for archive (not compressio).
+	io::ZlibCompressionOptions zlib_compression_options = zlib_compression_options = io::ZlibCompressionOptions::GZIP();
+        file_stream_.reset(new io::RandomAccessInputStream(file_.get()));
+	stream_.reset(new io::ZlibInputStream(file_stream_.get(), 65536, 65536,  zlib_compression_options));
+        return Status::OK();
       }
       archive_.reset(archive_read_new());
 
@@ -405,6 +435,7 @@ class InputDatasetBase : public DatasetBase {
       current_input_state_.reset(nullptr);
       stream_.reset(nullptr);
       archive_.reset(nullptr);
+      file_stream_.reset(nullptr);
       file_.reset(nullptr);
     }
 
@@ -413,6 +444,7 @@ class InputDatasetBase : public DatasetBase {
     std::unique_ptr<StateType> current_input_state_ GUARDED_BY(mu_);
     std::unique_ptr<io::InputStreamInterface> stream_ GUARDED_BY(mu_);
     std::unique_ptr<struct archive, void(*)(struct archive *)> archive_ GUARDED_BY(mu_);
+    std::unique_ptr<io::InputStreamInterface> file_stream_ GUARDED_BY(mu_);
     std::unique_ptr<tensorflow::RandomAccessFile> file_ GUARDED_BY(mu_);
   };
   OpKernelContext* ctx_;

diff --git a/tensorflow_io/mnist/kernels/mnist_dataset_ops.cc b/tensorflow_io/mnist/kernels/mnist_dataset_ops.cc
@@ -26,7 +26,10 @@ class MNISTImageInput: public DataInput<int64> {
       TF_RETURN_IF_ERROR(s.SkipNBytes(16));
     }
     string buffer;
-    TF_RETURN_IF_ERROR(ReadInputStream(s, (rows_ * cols_), 1, &buffer , returned));
+    Status status = ReadInputStream(s, (rows_ * cols_), 1, &buffer , returned);
+    if (!(status.ok() || errors::IsOutOfRange(status))) {
+      return status;
+    }
     (*(state.get())) += *returned;
     if (*returned == 1) {
       Tensor value_tensor(ctx->allocator({}), DT_UINT8, {rows_, cols_});

diff --git a/tensorflow_io/text/BUILD b/tensorflow_io/text/BUILD
@@ -5,6 +5,7 @@ package(default_visibility = ["//visibility:public"])
 cc_binary(
     name = "python/ops/_text_ops.so",
     srcs = [
+        "kernels/text_input.cc",
         "kernels/text_sequence.cc",
         "ops/text_ops.cc",
     ],
@@ -18,7 +19,9 @@ cc_binary(
     ],
     linkshared = 1,
     deps = [
+        "//tensorflow_io/core:dataset_ops",
         "//tensorflow_io/core:sequence_ops",
+        "@libarchive",
         "@local_config_tf//:libtensorflow_framework",
         "@local_config_tf//:tf_header_lib",
     ],

diff --git a/tensorflow_io/text/__init__.py b/tensorflow_io/text/__init__.py
@@ -12,21 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TextOutputSequence
+"""TextInput/TextOutput
 
 @@TextOutputSequence
+@@TextDataset
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from tensorflow_io.text.python.ops.text_ops import TextOutputSequence
+from tensorflow_io.text.python.ops.text_ops import TextDataset
 
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
     "TextOutputSequence",
+    "TextDataset",
 ]
 
 remove_undocumented(__name__)
diff --git a/tensorflow_io/text/kernels/text_input.cc b/tensorflow_io/text/kernels/text_input.cc
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "kernels/dataset_ops.h"
+#include "tensorflow/core/lib/io/buffered_inputstream.h"
+
+namespace tensorflow {
+namespace data {
+
+class TextInput: public DataInput<io::BufferedInputStream> {
+ public:
+  Status ReadRecord(io::InputStreamInterface& s, IteratorContext* ctx, std::unique_ptr<io::BufferedInputStream>& state, int64* returned, std::vector<Tensor>* out_tensors) const override {
+    if (state.get() == nullptr) {
+      state.reset(new io::BufferedInputStream(&s, 4096));
+    }
+    string buffer;
+    Status status = state.get()->ReadLine(&buffer);
+    if (!(status.ok() || errors::IsOutOfRange(status))) {
+      return status;
+    }
+    *returned = (status.ok()) ? 1 : 0;
+    if (*returned == 1) {
+      Tensor value_tensor(ctx->allocator({}), DT_STRING, {});
+      value_tensor.scalar<string>()() = buffer;
+      out_tensors->emplace_back(std::move(value_tensor));
+    }
+    return Status::OK();
+  }
+  Status FromStream(io::InputStreamInterface& s) override {
+    // TODO: Read 4K buffer to detect BOM.
+    //string header;
+    //TF_RETURN_IF_ERROR(s.ReadNBytes(4096, &header));
+    //for (size i = 0; i < header.size(); i++) {
+    //  if (!isprint(header[i])) {
+    //    return errors::InvalidArgument("text file contains character that is non printable at ", i);
+    //  }
+    //}
+    return Status::OK();
+  }
+  void EncodeAttributes(VariantTensorData* data) const override {
+  }
+  bool DecodeAttributes(const VariantTensorData& data) override {
+    return true;
+  }
+ protected:
+};
+
+REGISTER_UNARY_VARIANT_DECODE_FUNCTION(TextInput, "tensorflow::data::TextInput");
+
+REGISTER_KERNEL_BUILDER(Name("TextInput").Device(DEVICE_CPU),
+                        DataInputOp<TextInput>);
+REGISTER_KERNEL_BUILDER(Name("TextDataset").Device(DEVICE_CPU),
+                        InputDatasetOp<TextInput, io::BufferedInputStream>);
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow_io/text/ops/text_ops.cc b/tensorflow_io/text/ops/text_ops.cc
@@ -19,6 +19,27 @@ limitations under the License.
 
 namespace tensorflow {
 
+REGISTER_OP("TextInput")
+    .Input("source: string")
+    .Output("handle: variant")
+    .Attr("filters: list(string) = []")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+       c->set_output(0, c->MakeShape({c->UnknownDim()}));
+       return Status::OK();
+     });
+
+REGISTER_OP("TextDataset")
+    .Input("input: T")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("T: {string, variant} = DT_VARIANT")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+       c->set_output(0, c->MakeShape({}));
+       return Status::OK();
+     });
+
 REGISTER_OP("TextOutputSequence")
     .Input("destination: string")
     .Output("sequence: resource")

diff --git a/tensorflow_io/text/python/ops/text_ops.py b/tensorflow_io/text/python/ops/text_ops.py
@@ -12,14 +12,50 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TextOutputSequence."""
+"""TextInput/TextOutput."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import tensorflow
+from tensorflow import dtypes
+from tensorflow.compat.v1 import data
 from tensorflow_io import _load_library
 text_ops = _load_library('_text_ops.so')
 
+class TextDataset(data.Dataset):
+  """A Text Dataset
+  """
+
+  def __init__(self, filename):
+    """Create a Text Reader.
+
+    Args:
+      filename: A `tf.string` tensor containing one or more filenames.
+    """
+    self._data_input = text_ops.text_input(filename, ["none", "gz"])
+    super(TextDataset, self).__init__()
+
+  def _inputs(self):
+    return []
+
+  def _as_variant_tensor(self):
+    return text_ops.text_dataset(
+        self._data_input,
+        output_types=self.output_types,
+        output_shapes=self.output_shapes)
+
+  @property
+  def output_shapes(self):
+    return tuple([tensorflow.TensorShape([])])
+
+  @property
+  def output_classes(self):
+    return tensorflow.Tensor
+
+  @property
+  def output_types(self):
+    return tuple([dtypes.string])
 
 class TextOutputSequence(object):
   """TextOutputSequence"""