diff --git a/.gitignore b/.gitignore index 24b0b4682..86f08e816 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ /.bazelrc /bazel-* /artifacts +/get-pip.py .DS_Store *.pyc __pycache__ diff --git a/.travis.yml b/.travis.yml index 7e488b92f..26965a129 100644 --- a/.travis.yml +++ b/.travis.yml @@ -45,24 +45,6 @@ jobs: script: - docker run -i -t --rm -v $PWD:/v -v $PWD/.cache/pip/:/root/.cache/pip -w /v --net=host buildpack-deps:14.04 bash -x -e .travis/python.release.sh "${TENSORFLOW_INSTALL}" python python3.6 - docker run -i -t --rm -v $PWD:/v -v $PWD/.cache/pip/:/root/.cache/pip -w /v --net=host -e GITHUB_PAT=9eecea9200150af1ec29f70bb067575eb2e56fc7 buildpack-deps:18.04 bash -x -e .travis/wheel.test.sh - # Developer Builds make sure the source code of the repo could be - # build and run on commodity developer environment (Ubuntu 16.04/18.04). - - stage: build - name: "Developer Build on Ubuntu 16.04" - before_script: &developer_build - - | - echo "bash -x -e .travis/bazel.configure.sh \"${TENSORFLOW_INSTALL}\"" > script.sh - echo "bash -x -e .travis/bazel.build.sh" >> script.sh - echo "bash -x -e .travis/build.test.sh \"${TENSORFLOW_INSTALL}\"" >> script.sh - - cat script.sh - script: - - docker run -i -t --rm -v $PWD:/v -v $PWD/.cache/pip/:/root/.cache/pip -w /v --net=host buildpack-deps:16.04 bash -x -e script.sh - - stage: build - name: "Developer Build on Ubuntu 18.04" - before_script: *developer_build - script: - - docker run -i -t --rm -v $PWD:/v -v $PWD/.cache/pip/:/root/.cache/pip -w /v --net=host buildpack-deps:18.04 bash -x -e script.sh - # Preview Release Builds are for TensorFlow 2.0 Preview release. # Note only Linux (Ubuntu 18.04) and macOS are supported. - stage: release diff --git a/tensorflow_io/core/kernels/dataset_ops.h b/tensorflow_io/core/kernels/dataset_ops.h index f230f915b..318e25d94 100644 --- a/tensorflow_io/core/kernels/dataset_ops.h +++ b/tensorflow_io/core/kernels/dataset_ops.h @@ -21,6 +21,8 @@ limitations under the License. 
#include "tensorflow/core/framework/dataset.h" #include "tensorflow/core/lib/io/inputstream_interface.h" #include "tensorflow/core/lib/io/random_inputstream.h" +#include "tensorflow/core/lib/io/zlib_compression_options.h" +#include "tensorflow/core/lib/io/zlib_inputstream.h" #include "tensorflow/core/framework/variant_op_registry.h" namespace tensorflow { @@ -231,9 +233,9 @@ class DataInputOp: public OpKernel { OP_REQUIRES_OK(ctx, env_->NewRandomAccessFile(filename, &file)); if (filters_.size() == 0) { // No filter means only a file stream. - io::RandomAccessInputStream s(file.get()); + io::RandomAccessInputStream file_stream(file.get()); T entry; - OP_REQUIRES_OK(ctx, entry.FromInputStream(s, filename, string(""), string(""))); + OP_REQUIRES_OK(ctx, entry.FromInputStream(file_stream, filename, string(""), string(""))); output.emplace_back(std::move(entry)); continue; } @@ -241,10 +243,10 @@ class DataInputOp: public OpKernel { std::unique_ptr archive(archive_read_new(), [](struct archive *a){ archive_read_free(a);}); OP_REQUIRES_OK(ctx, ArchiveInputStream::SetupFilters(archive.get(), filters_)); - ArchiveInputStream s(file.get(), archive.get()); + ArchiveInputStream archive_stream(file.get(), archive.get()); OP_REQUIRES( - ctx, (archive_read_open(archive.get(), &s, NULL, ArchiveInputStream::CallbackRead, NULL) == ARCHIVE_OK), + ctx, (archive_read_open(archive.get(), &archive_stream, NULL, ArchiveInputStream::CallbackRead, NULL) == ARCHIVE_OK), errors::InvalidArgument("unable to open datainput for ", filename, ": ", archive_error_string(archive.get()))); size_t index = output.size(); @@ -254,9 +256,26 @@ class DataInputOp: public OpKernel { string entryname = archive_entry_pathname(entry); string filtername; if (ArchiveInputStream::MatchFilters(archive.get(), entryname, filters_, &filtername)) { - s.ResetEntryOffset(); T entry; - OP_REQUIRES_OK(ctx, entry.FromInputStream(s, filename, entryname, filtername)); + if (filtername == "none") { + // If filter is none, 
then just use the initial stream. + // NOTE: Looks like libarchive may not be able to handle + // none with text type correctly (not reading data in none archive) + // So use the shortcut here. + io::RandomAccessInputStream file_stream(file.get()); + OP_REQUIRES_OK(ctx, entry.FromInputStream(file_stream, filename, entryname, filtername)); + } else if (filtername == "gz") { + // Treat gz file specially. Looks like libarchive always has issues + // with text file so use ZlibInputStream. Now libarchive + // is mostly used for archive (not compression). + io::RandomAccessInputStream file_stream(file.get()); + io::ZlibCompressionOptions zlib_compression_options = io::ZlibCompressionOptions::GZIP(); + io::ZlibInputStream compression_stream(&file_stream, 65536, 65536, zlib_compression_options); + OP_REQUIRES_OK(ctx, entry.FromInputStream(compression_stream, filename, entryname, filtername)); + } else { + archive_stream.ResetEntryOffset(); + OP_REQUIRES_OK(ctx, entry.FromInputStream(archive_stream, filename, entryname, filtername)); + } output.emplace_back(std::move(entry)); } } @@ -374,10 +393,21 @@ class InputDatasetBase : public DatasetBase { current_input_state_.reset(nullptr); TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file_)); - if (filtername.size() == 0) { - // No filter means only a file stream. + if (filtername.size() == 0 || filtername == "none") { + // If filter is none, then just use the initial stream. + // NOTE: Looks like libarchive may not be able to handle + // none with text type correctly (not reading data in none archive) + // So use the shortcut here. stream_.reset(new io::RandomAccessInputStream(file_.get())); return Status::OK(); + } else if (filtername == "gz") { + // Treat gz file specially. Looks like libarchive always has issues + // with text file so use ZlibInputStream. Now libarchive + // is mostly used for archive (not compression). 
+ io::ZlibCompressionOptions zlib_compression_options = io::ZlibCompressionOptions::GZIP(); + file_stream_.reset(new io::RandomAccessInputStream(file_.get())); + stream_.reset(new io::ZlibInputStream(file_stream_.get(), 65536, 65536, zlib_compression_options)); + return Status::OK(); } archive_.reset(archive_read_new()); @@ -405,6 +435,7 @@ class InputDatasetBase : public DatasetBase { current_input_state_.reset(nullptr); stream_.reset(nullptr); archive_.reset(nullptr); + file_stream_.reset(nullptr); file_.reset(nullptr); } @@ -413,6 +444,7 @@ class InputDatasetBase : public DatasetBase { std::unique_ptr current_input_state_ GUARDED_BY(mu_); std::unique_ptr stream_ GUARDED_BY(mu_); std::unique_ptr archive_ GUARDED_BY(mu_); + std::unique_ptr file_stream_ GUARDED_BY(mu_); std::unique_ptr file_ GUARDED_BY(mu_); }; OpKernelContext* ctx_; diff --git a/tensorflow_io/mnist/kernels/mnist_dataset_ops.cc b/tensorflow_io/mnist/kernels/mnist_dataset_ops.cc index f7ed48f7b..f9bce6651 100644 --- a/tensorflow_io/mnist/kernels/mnist_dataset_ops.cc +++ b/tensorflow_io/mnist/kernels/mnist_dataset_ops.cc @@ -26,7 +26,10 @@ class MNISTImageInput: public DataInput { TF_RETURN_IF_ERROR(s.SkipNBytes(16)); } string buffer; - TF_RETURN_IF_ERROR(ReadInputStream(s, (rows_ * cols_), 1, &buffer , returned)); + Status status = ReadInputStream(s, (rows_ * cols_), 1, &buffer , returned); + if (!(status.ok() || errors::IsOutOfRange(status))) { + return status; + } (*(state.get())) += *returned; if (*returned == 1) { Tensor value_tensor(ctx->allocator({}), DT_UINT8, {rows_, cols_}); diff --git a/tensorflow_io/text/BUILD b/tensorflow_io/text/BUILD index b7b7f0ab0..3920447a5 100644 --- a/tensorflow_io/text/BUILD +++ b/tensorflow_io/text/BUILD @@ -5,6 +5,7 @@ package(default_visibility = ["//visibility:public"]) cc_binary( name = "python/ops/_text_ops.so", srcs = [ + "kernels/text_input.cc", "kernels/text_sequence.cc", "ops/text_ops.cc", ], @@ -18,7 +19,9 @@ cc_binary( ], 
linkshared = 1, deps = [ + "//tensorflow_io/core:dataset_ops", "//tensorflow_io/core:sequence_ops", + "@libarchive", "@local_config_tf//:libtensorflow_framework", "@local_config_tf//:tf_header_lib", ], diff --git a/tensorflow_io/text/__init__.py b/tensorflow_io/text/__init__.py index e5f7ee9ad..65d8a33da 100644 --- a/tensorflow_io/text/__init__.py +++ b/tensorflow_io/text/__init__.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""TextOutputSequence +"""TextInput/TextOutput @@TextOutputSequence +@@TextDataset """ from __future__ import absolute_import @@ -22,11 +23,13 @@ from __future__ import print_function from tensorflow_io.text.python.ops.text_ops import TextOutputSequence +from tensorflow_io.text.python.ops.text_ops import TextDataset from tensorflow.python.util.all_util import remove_undocumented _allowed_symbols = [ "TextOutputSequence", + "TextDataset", ] remove_undocumented(__name__) diff --git a/tensorflow_io/text/kernels/text_input.cc b/tensorflow_io/text/kernels/text_input.cc new file mode 100644 index 000000000..31ed20205 --- /dev/null +++ b/tensorflow_io/text/kernels/text_input.cc @@ -0,0 +1,67 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "kernels/dataset_ops.h" +#include "tensorflow/core/lib/io/buffered_inputstream.h" + +namespace tensorflow { +namespace data { + +class TextInput: public DataInput { + public: + Status ReadRecord(io::InputStreamInterface& s, IteratorContext* ctx, std::unique_ptr& state, int64* returned, std::vector* out_tensors) const override { + if (state.get() == nullptr) { + state.reset(new io::BufferedInputStream(&s, 4096)); + } + string buffer; + Status status = state.get()->ReadLine(&buffer); + if (!(status.ok() || errors::IsOutOfRange(status))) { + return status; + } + *returned = (status.ok()) ? 1 : 0; + if (*returned == 1) { + Tensor value_tensor(ctx->allocator({}), DT_STRING, {}); + value_tensor.scalar()() = buffer; + out_tensors->emplace_back(std::move(value_tensor)); + } + return Status::OK(); + } + Status FromStream(io::InputStreamInterface& s) override { + // TODO: Read a 4K header to detect a BOM and reject non-printable input; for now every stream is accepted. + //string header; + //TF_RETURN_IF_ERROR(s.ReadNBytes(4096, &header)); + //for (size_t i = 0; i < header.size(); i++) { + // if (!isprint(header[i])) { + // return errors::InvalidArgument("text file contains character that is non printable at ", i); + // } + //} + return Status::OK(); + } + void EncodeAttributes(VariantTensorData* data) const override { + } + bool DecodeAttributes(const VariantTensorData& data) override { + return true; + } + protected: +}; + +REGISTER_UNARY_VARIANT_DECODE_FUNCTION(TextInput, "tensorflow::data::TextInput"); + +REGISTER_KERNEL_BUILDER(Name("TextInput").Device(DEVICE_CPU), + DataInputOp); +REGISTER_KERNEL_BUILDER(Name("TextDataset").Device(DEVICE_CPU), + InputDatasetOp); +} // namespace data +} // namespace tensorflow diff --git a/tensorflow_io/text/ops/text_ops.cc b/tensorflow_io/text/ops/text_ops.cc index af071f6f7..c291a420b 100644 --- a/tensorflow_io/text/ops/text_ops.cc +++ b/tensorflow_io/text/ops/text_ops.cc @@ -19,6 +19,27 @@ limitations under the 
License. namespace tensorflow { +REGISTER_OP("TextInput") + .Input("source: string") + .Output("handle: variant") + .Attr("filters: list(string) = []") + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->MakeShape({c->UnknownDim()})); + return Status::OK(); + }); + +REGISTER_OP("TextDataset") + .Input("input: T") + .Output("handle: variant") + .Attr("output_types: list(type) >= 1") + .Attr("output_shapes: list(shape) >= 1") + .Attr("T: {string, variant} = DT_VARIANT") + .SetIsStateful() + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->MakeShape({})); + return Status::OK(); + }); + REGISTER_OP("TextOutputSequence") .Input("destination: string") .Output("sequence: resource") diff --git a/tensorflow_io/text/python/ops/text_ops.py b/tensorflow_io/text/python/ops/text_ops.py index 740b6cf7b..d29e524ec 100644 --- a/tensorflow_io/text/python/ops/text_ops.py +++ b/tensorflow_io/text/python/ops/text_ops.py @@ -12,14 +12,50 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""TextOutputSequence.""" +"""TextInput/TextOutput.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function +import tensorflow +from tensorflow import dtypes +from tensorflow.compat.v1 import data from tensorflow_io import _load_library text_ops = _load_library('_text_ops.so') +class TextDataset(data.Dataset): + """A Dataset of scalar string elements read from plain ("none") or "gz" filtered text files. + """ + + def __init__(self, filename): + """Create a TextDataset. + + Args: + filename: A `tf.string` tensor containing one or more filenames. 
+ """ + self._data_input = text_ops.text_input(filename, ["none", "gz"]) + super(TextDataset, self).__init__() + + def _inputs(self): + return [] + + def _as_variant_tensor(self): + return text_ops.text_dataset( + self._data_input, + output_types=self.output_types, + output_shapes=self.output_shapes) + + @property + def output_shapes(self): + return tuple([tensorflow.TensorShape([])]) + + @property + def output_classes(self): + return tensorflow.Tensor + + @property + def output_types(self): + return tuple([dtypes.string]) class TextOutputSequence(object): """TextOutputSequence""" diff --git a/tests/test_text.py b/tests/test_text.py new file mode 100644 index 000000000..d3aa805a1 --- /dev/null +++ b/tests/test_text.py @@ -0,0 +1,60 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================== +"""Tests for Text Input.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import pytest +import tensorflow +tensorflow.compat.v1.disable_eager_execution() + +from tensorflow import errors # pylint: disable=wrong-import-position +import tensorflow_io.text as text_io # pylint: disable=wrong-import-position + +def test_text_input(): + """test_text_input + """ + text_filename = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "test_text", "lorem.txt") + with open(text_filename, 'rb') as f: + lines = [line.strip() for line in f] + text_filename = "file://" + text_filename + + gzip_text_filename = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "test_text", "lorem.txt.gz") + gzip_text_filename = "file://" + gzip_text_filename + + num_repeats = 2 + + filenames = [text_filename, gzip_text_filename] + dataset = text_io.TextDataset(filenames).repeat(num_repeats) + iterator = dataset.make_initializable_iterator() + init_op = iterator.initializer + get_next = iterator.get_next() + with tensorflow.compat.v1.Session() as sess: + sess.run(init_op) + for _ in range(num_repeats): + for _ in filenames: + for i in lines: + v = sess.run(get_next) + assert i == v + with pytest.raises(errors.OutOfRangeError): + sess.run(get_next) + +if __name__ == "__main__": + test.main() diff --git a/tests/test_text/lorem.txt b/tests/test_text/lorem.txt new file mode 100644 index 000000000..265551207 --- /dev/null +++ b/tests/test_text/lorem.txt @@ -0,0 +1,49 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque in suscipit ex. Praesent ligula nisl, fringilla id nibh eu, placerat pretium dolor. Aenean nisl mauris, euismod vitae laoreet eget, laoreet non tortor. Fusce aliquam lectus a lobortis bibendum. Phasellus id quam nisi. Ut vulputate urna neque. Suspendisse eu nisi lorem. 
Donec ex mi, sollicitudin quis malesuada eget, ullamcorper eget purus. Mauris quis elit et erat ullamcorper rhoncus. Cras enim arcu, ultrices vel malesuada et, placerat id nunc. Aliquam rhoncus molestie nisi, ac feugiat dolor placerat quis. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. + +Cras vel sapien id augue mattis sodales. Praesent sodales nunc vel imperdiet consectetur. Aenean blandit lacinia lacinia. Praesent congue pulvinar ipsum convallis imperdiet. Nulla sodales dui nunc, mattis mollis velit ornare at. Suspendisse id viverra diam. Duis eget nulla ultricies augue aliquam sodales id sit amet magna. Vivamus auctor metus ac neque molestie, interdum consequat quam cursus. Etiam aliquet sagittis elit eget pulvinar. Suspendisse potenti. Etiam et egestas mauris. Praesent eu maximus dui, in suscipit lorem. Integer commodo purus ac quam lobortis dapibus. + +Sed non ante cursus, porttitor dui quis, cursus eros. Curabitur a enim augue. Vivamus venenatis facilisis velit ac ultrices. Nunc feugiat consectetur nunc, vel posuere orci sagittis ut. Proin purus tortor, laoreet et pharetra nec, cursus eget nunc. Vestibulum sodales ligula feugiat, imperdiet massa in, faucibus magna. In hac habitasse platea dictumst. Cras sit amet scelerisque leo, ut ornare erat. Vestibulum volutpat eros elit. Cras facilisis eu sapien ut elementum. Duis placerat nunc a molestie porttitor. + +Cras dapibus interdum mi et maximus. Integer mollis lectus et ipsum porttitor accumsan. Integer ac ultricies purus. Ut iaculis egestas eros at hendrerit. Pellentesque velit sem, suscipit ac odio eget, egestas laoreet dolor. Nunc sed tellus enim. Donec aliquam, sapien eu lacinia maximus, magna ligula tempor neque, ullamcorper varius ipsum risus a ipsum. Pellentesque tempus blandit neque in malesuada. Phasellus sodales enim ante, sed volutpat felis rhoncus luctus. Phasellus et varius ex. 
+ +Morbi elementum, massa a efficitur semper, orci nisl cursus ipsum, sit amet convallis magna risus et odio. Donec lobortis tellus ante, vel tempor elit pulvinar vitae. Integer ultrices ultricies metus vestibulum ultricies. Fusce pellentesque quam quis nulla tincidunt tincidunt. Mauris commodo, ligula eget venenatis blandit, ex mauris mollis diam, a feugiat velit leo eget risus. Curabitur non odio a nisi convallis molestie. Fusce posuere quis arcu vitae vestibulum. Sed diam enim, aliquam nec neque vitae, volutpat pellentesque est. Vivamus quis finibus lacus. Curabitur consequat varius vestibulum. + +Proin bibendum sem ac aliquet interdum. Proin dolor augue, vestibulum eget tincidunt id, convallis id augue. Aenean nec pretium est. Sed consectetur sagittis leo, in faucibus turpis elementum vitae. Cras non lacus tellus. Etiam lorem elit, posuere in orci at, malesuada aliquet tellus. Proin vel lorem purus. Vestibulum a enim sit amet mi vehicula viverra. Vivamus eget sapien eu purus molestie fringilla id nec est. + +Mauris aliquet, urna a porttitor fringilla, eros risus facilisis nulla, sed dapibus mauris magna vel metus. In posuere porta tellus, a auctor purus sagittis sodales. Nulla pharetra quis libero eu elementum. Ut tempor pulvinar pharetra. Mauris diam ipsum, egestas ut vulputate convallis, dictum sed nisi. Donec tempor auctor urna, in aliquet mauris tincidunt dapibus. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. + +Vestibulum sit amet efficitur est. Integer sed dui suscipit, maximus lacus ac, pretium nisl. Vestibulum tempus, nunc at interdum mollis, nisi libero malesuada quam, sit amet mattis justo felis at nisi. Sed laoreet, diam ac consequat dignissim, dolor erat cursus ex, et eleifend tortor tortor ac odio. Pellentesque neque eros, pellentesque sit amet ligula ac, egestas finibus felis. Sed vulputate porttitor erat, in vestibulum diam fringilla quis. Vivamus imperdiet odio eget justo suscipit commodo. 
Vestibulum aliquet condimentum lacus, et pharetra justo. + +Vestibulum sagittis placerat metus, quis luctus lectus consequat vitae. Aliquam mattis, nisl sed auctor finibus, arcu nisl pretium orci, ut eleifend mauris felis nec tellus. Vivamus egestas, urna at lacinia tempus, diam turpis feugiat magna, in mollis odio urna ut tellus. Donec accumsan sem vitae luctus sagittis. Cras vulputate libero eu leo efficitur iaculis. Donec mauris velit, euismod sit amet suscipit vitae, semper vitae libero. Aliquam aliquet, metus vitae pretium efficitur, nisi magna pellentesque ligula, nec convallis libero leo eget est. Duis pharetra eleifend sollicitudin. Proin lacus magna, euismod ac mollis id, dignissim quis eros. Morbi pretium justo sed elit pretium elementum. Nullam et tellus ut metus ornare maximus. Duis sit amet magna mi. Vestibulum in tincidunt dolor, tempus consequat magna. Nulla sollicitudin dolor a ultricies vulputate. In commodo turpis non eleifend ultricies. Morbi tempor efficitur nulla sit amet aliquam. + +Donec pretium metus in hendrerit mollis. Nulla dictum suscipit quam. Maecenas et tempus libero, sit amet viverra neque. Duis efficitur, sem ac placerat porta, diam leo dapibus nunc, vitae accumsan lectus nulla sed erat. Vivamus vestibulum est ac elit scelerisque egestas. Curabitur suscipit lectus vel tellus ultricies, in eleifend libero blandit. Donec nec nisi sapien. Aliquam feugiat, sapien ac porttitor efficitur, velit nisi suscipit est, a efficitur dui nisi id ante. Morbi id justo venenatis, vehicula velit sed, dictum felis. + +Curabitur ultricies bibendum velit in efficitur. In dignissim quis nisl eget gravida. Pellentesque varius eros elit, eu auctor quam fermentum sed. Vestibulum mollis scelerisque tellus, a imperdiet diam tempus sit amet. Aenean molestie lectus eu felis tempor aliquam. Vivamus ac sapien tristique quam ultrices volutpat. Curabitur semper nibh eget massa vehicula rutrum. Pellentesque tempor ligula ac arcu posuere, vel accumsan leo vehicula. 
+ +Donec vitae scelerisque nisl. Quisque quis porttitor dolor, sed tincidunt lorem. Phasellus in dignissim turpis. Aliquam laoreet libero quis lectus cursus, efficitur euismod lorem bibendum. Proin et turpis mi. In in tellus ut ligula commodo venenatis eget eget sapien. Sed diam enim, suscipit ut nunc ut, sagittis iaculis nulla. In eleifend, diam eget malesuada pretium, sapien lectus luctus elit, quis tempus mauris metus sodales odio. + +Praesent convallis in dui ut vehicula. Phasellus sed volutpat est. Quisque at purus at magna vestibulum sollicitudin. Praesent scelerisque tellus id nibh facilisis, ut luctus lorem euismod. Cras porttitor suscipit varius. Nunc at ipsum purus. In hac habitasse platea dictumst. Proin eget tortor a odio luctus vestibulum. Pellentesque eget molestie mauris. Sed porta luctus pretium. Donec egestas nibh quis congue aliquet. Integer mollis lorem neque, sit amet volutpat diam tincidunt eu. Morbi vitae dolor tristique, interdum sem eu, semper neque. Integer non metus id nibh auctor commodo vel ut quam. Vivamus non tempor justo. + +Ut ut facilisis arcu. Maecenas fermentum accumsan leo, eu porttitor quam suscipit in. Maecenas eget maximus tortor. In et ipsum eget risus volutpat viverra. Quisque eget ipsum nec nibh pretium venenatis ut at magna. Vivamus molestie ipsum id erat venenatis, id porttitor nisi finibus. Nam nec tortor eget tortor tempor faucibus. Mauris accumsan elit vitae mauris consectetur rutrum. + +Suspendisse vel elit sit amet libero dignissim aliquam ut sed sapien. Aenean imperdiet, nisi pharetra pharetra feugiat, orci quam cursus est, nec dignissim metus lacus id nibh. Nam nulla libero, condimentum ut volutpat ut, auctor eget erat. Vivamus eu bibendum dui. Sed varius nulla sapien, nec laoreet orci ultrices et. In vulputate sagittis nisl eu semper. Sed gravida magna sit amet efficitur porttitor. Etiam nisl ligula, aliquet sit amet accumsan at, rhoncus in turpis. Nam suscipit quam est, quis aliquet eros pharetra interdum. 
Quisque accumsan dolor ut dui mollis, tristique tristique elit feugiat. Etiam gravida aliquam elit, semper ullamcorper neque consectetur quis. Phasellus ac volutpat nibh. + +Duis pulvinar dapibus sapien et ultrices. Sed at mi sem. Fusce tincidunt sed lacus imperdiet laoreet. Vivamus a cursus tortor, quis commodo libero. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean quam neque, congue nec est a, viverra viverra dolor. Nulla tempor odio nibh. Nam ut ligula odio. Nam tellus arcu, luctus ac placerat ac, consectetur eget justo. Vivamus sagittis nulla dui, vitae cursus ex commodo in. + +Vivamus nisl magna, pretium sit amet nulla sit amet, ultrices pharetra nibh. Proin sapien tellus, malesuada vel aliquam eget, ultricies vitae nisi. Suspendisse iaculis, massa vel feugiat luctus, elit dui semper mauris, quis finibus diam metus eu libero. Phasellus sed congue erat, congue laoreet ligula. Sed a pulvinar lorem. Donec aliquam pretium rutrum. Praesent in elit rhoncus, egestas dui et, sodales sapien. Suspendisse varius accumsan ante a pellentesque. In vehicula risus ut venenatis rutrum. Aenean sed nisl elementum dolor aliquet luctus. + +Proin erat lacus, pellentesque vitae tortor a, auctor condimentum lectus. Maecenas eget vehicula eros, sed semper est. Phasellus sed feugiat metus, rhoncus semper est. Ut id elit a sem egestas fermentum id interdum metus. Nam ac porta urna. Sed posuere purus vel risus convallis hendrerit. Pellentesque est turpis, malesuada vitae arcu in, ultrices tincidunt sem. Etiam leo tortor, aliquet nec consequat et, cursus nec tortor. Nunc sit amet purus ultricies magna scelerisque euismod. Praesent dictum odio neque, ac semper lectus laoreet in. In semper nulla vel augue semper eleifend. Aenean et maximus risus. Proin vestibulum odio non nibh interdum sollicitudin. + +Duis suscipit enim et nisl pharetra, et venenatis lorem sagittis. Suspendisse dui tellus, condimentum imperdiet elementum vitae, pellentesque vitae erat. 
Phasellus consequat varius est, eget scelerisque est congue vel. Cras ornare sollicitudin massa, non aliquet ipsum mollis at. Aliquam tempor fermentum ex eget egestas. Phasellus quis risus vitae magna eleifend luctus ut quis turpis. In egestas ornare est. + +Donec faucibus convallis ante, ullamcorper sollicitudin urna rutrum sit amet. Mauris id dolor efficitur, faucibus velit nec, laoreet ante. Pellentesque malesuada quam eleifend leo interdum egestas. Aliquam sit amet nibh iaculis, tristique ex a, semper dolor. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Morbi iaculis ex consequat, laoreet nunc aliquam, dictum diam. Quisque eu interdum purus. Nam venenatis nulla sit amet neque cursus, sit amet pharetra urna finibus. Morbi tristique purus enim. Nam sed ornare elit, ut ultrices nisi. Cras in commodo tellus, non suscipit ex. Nulla facilisi. Nunc finibus nulla eu felis maximus, nec dapibus lacus aliquet. + +Aenean cursus fringilla eros sit amet fermentum. Pellentesque dictum nulla nec mauris porta, sed blandit nisi vestibulum. Curabitur ut magna lobortis, venenatis leo at, tempor orci. Aliquam vitae libero ac sem eleifend cursus. Integer ac quam id nunc aliquam pellentesque at non metus. Duis nibh sem, pulvinar non suscipit ut, suscipit eu ex. Proin bibendum porta mauris. Phasellus eu nisl id risus suscipit commodo vel ut tortor. Morbi lacinia, turpis vitae fringilla hendrerit, tellus ante semper nibh, cursus rhoncus turpis neque in nisl. Suspendisse faucibus eget odio sit amet vestibulum. + +Sed congue mi ac arcu fringilla imperdiet. Aliquam rhoncus eget neque et lobortis. Pellentesque vitae ante posuere libero sollicitudin aliquet. Morbi porttitor aliquet eros vel pulvinar. Praesent ante enim, suscipit quis finibus sit amet, varius a ligula. Praesent sodales, eros facilisis porta ultricies, sem sem scelerisque quam, sed pellentesque eros felis in nisl. Aenean ut lacus ut magna varius faucibus. + +Proin id ipsum sed augue fermentum tempus. 
Donec dapibus aliquet lacus sed mollis. Aliquam id tortor finibus, tempor nunc posuere, feugiat ligula. In sollicitudin ante eget interdum condimentum. Etiam porttitor nunc quis venenatis molestie. Cras sed sem nec sem bibendum volutpat. Aenean eu est non lorem dapibus pellentesque. Nunc pellentesque et enim non luctus. Integer sed quam augue. Nam interdum tincidunt auctor. Vestibulum luctus nibh nisl, nec aliquet enim ullamcorper vitae. Morbi suscipit eleifend pulvinar. Pellentesque id consequat mi. Duis fermentum, felis vitae commodo suscipit, lacus felis dignissim nunc, accumsan semper metus velit feugiat leo. + +Sed rutrum odio nec massa tincidunt condimentum. Vestibulum varius dui dolor, quis tincidunt est rutrum sit amet. Integer quis purus porta, consectetur leo vel, posuere justo. Mauris semper tortor a leo faucibus, ac vulputate elit fringilla. Ut quis orci interdum, elementum metus et, mattis tellus. Ut venenatis ut augue vulputate mollis. Mauris a odio et massa blandit porttitor et in dui. Suspendisse potenti. Nunc maximus ex a mattis luctus. Integer felis ipsum, rhoncus id ex at, interdum suscipit magna. Nam dapibus vehicula sollicitudin. Nulla facilisi. Vivamus in massa blandit, luctus purus eget, feugiat diam. Vivamus euismod est at arcu iaculis, in feugiat libero pharetra. Proin sagittis, orci ut egestas malesuada, leo odio tincidunt dui, nec sollicitudin quam dolor auctor nunc. Pellentesque scelerisque metus id lacinia auctor. + +Integer eget lacinia ligula. Donec id odio ornare, condimentum est at, commodo erat. Nullam dignissim purus non leo sagittis, et tempus felis elementum. Donec odio odio, mollis a risus vel, hendrerit mattis ex. Proin varius a dui feugiat vestibulum. Nulla facilisi. Nulla erat quam, pulvinar id euismod eu, tristique ac urna. Mauris condimentum nunc eget eros suscipit aliquam. Aliquam dolor felis, lobortis non porta at, vestibulum eu lorem. Integer justo quam, tristique eu pulvinar lacinia, scelerisque eu turpis. 
Suspendisse tincidunt magna nisl, nec maximus lorem congue vel. Suspendisse potenti. Suspendisse sagittis tortor id odio vehicula, ut pharetra diam finibus. diff --git a/tests/test_text/lorem.txt.gz b/tests/test_text/lorem.txt.gz new file mode 100644 index 000000000..47df8f7ff Binary files /dev/null and b/tests/test_text/lorem.txt.gz differ