|
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
| 15 | + |
#include <cstring>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow_io/arrow/kernels/arrow_kernels.h"
#include "arrow/io/api.h"
#include "arrow/ipc/feather.h"
#include "arrow/ipc/feather_generated.h"
#include "arrow/buffer.h"
| 22 | + |
| 23 | +namespace tensorflow { |
| 24 | +namespace data { |
| 25 | +namespace { |
| 26 | + |
| 27 | +class ListFeatherColumnsOp : public OpKernel { |
| 28 | + public: |
| 29 | + explicit ListFeatherColumnsOp(OpKernelConstruction* context) : OpKernel(context) { |
| 30 | + env_ = context->env(); |
| 31 | + } |
| 32 | + |
| 33 | + void Compute(OpKernelContext* context) override { |
| 34 | + const Tensor& filename_tensor = context->input(0); |
| 35 | + const string filename = filename_tensor.scalar<string>()(); |
| 36 | + |
| 37 | + const Tensor& memory_tensor = context->input(1); |
| 38 | + const string& memory = memory_tensor.scalar<string>()(); |
| 39 | + std::unique_ptr<SizedRandomAccessFile> file(new SizedRandomAccessFile(env_, filename, memory.data(), memory.size())); |
| 40 | + uint64 size; |
| 41 | + OP_REQUIRES_OK(context, file->GetFileSize(&size)); |
| 42 | + |
| 43 | + // FEA1.....[metadata][uint32 metadata_length]FEA1 |
| 44 | + static constexpr const char* kFeatherMagicBytes = "FEA1"; |
| 45 | + |
| 46 | + size_t header_length = strlen(kFeatherMagicBytes); |
| 47 | + size_t footer_length = sizeof(uint32) + strlen(kFeatherMagicBytes); |
| 48 | + |
| 49 | + string buffer; |
| 50 | + buffer.resize(header_length > footer_length ? header_length : footer_length); |
| 51 | + |
| 52 | + StringPiece result; |
| 53 | + |
| 54 | + OP_REQUIRES_OK(context, file->Read(0, header_length, &result, &buffer[0])); |
| 55 | + OP_REQUIRES(context, !memcmp(buffer.data(), kFeatherMagicBytes, header_length), errors::InvalidArgument("not a feather file")); |
| 56 | + |
| 57 | + OP_REQUIRES_OK(context, file->Read(size - footer_length, footer_length, &result, &buffer[0])); |
| 58 | + OP_REQUIRES(context, !memcmp(buffer.data() + sizeof(uint32), kFeatherMagicBytes, footer_length - sizeof(uint32)), errors::InvalidArgument("incomplete feather file")); |
| 59 | + |
| 60 | + uint32 metadata_length = *reinterpret_cast<const uint32*>(buffer.data()); |
| 61 | + |
| 62 | + buffer.resize(metadata_length); |
| 63 | + |
| 64 | + OP_REQUIRES_OK(context, file->Read(size - footer_length - metadata_length, metadata_length, &result, &buffer[0])); |
| 65 | + |
| 66 | + const ::arrow::ipc::feather::fbs::CTable* table = ::arrow::ipc::feather::fbs::GetCTable(buffer.data()); |
| 67 | + |
| 68 | + OP_REQUIRES(context, (table->version() >= ::arrow::ipc::feather::kFeatherVersion), errors::InvalidArgument("feather file is old: ", table->version(), " vs. ", ::arrow::ipc::feather::kFeatherVersion)); |
| 69 | + |
| 70 | + std::vector<string> columns; |
| 71 | + std::vector<string> dtypes; |
| 72 | + std::vector<int64> counts; |
| 73 | + columns.reserve(table->columns()->size()); |
| 74 | + dtypes.reserve(table->columns()->size()); |
| 75 | + counts.reserve(table->columns()->size()); |
| 76 | + |
| 77 | + for (int64 i = 0; i < table->columns()->size(); i++) { |
| 78 | + DataType dtype = ::tensorflow::DataType::DT_INVALID; |
| 79 | + switch (table->columns()->Get(i)->values()->type()) { |
| 80 | + case ::arrow::ipc::feather::fbs::Type_BOOL: |
| 81 | + dtype = ::tensorflow::DataType::DT_BOOL; |
| 82 | + break; |
| 83 | + case ::arrow::ipc::feather::fbs::Type_INT8: |
| 84 | + dtype = ::tensorflow::DataType::DT_INT8; |
| 85 | + break; |
| 86 | + case ::arrow::ipc::feather::fbs::Type_INT16: |
| 87 | + dtype = ::tensorflow::DataType::DT_INT16; |
| 88 | + break; |
| 89 | + case ::arrow::ipc::feather::fbs::Type_INT32: |
| 90 | + dtype = ::tensorflow::DataType::DT_INT32; |
| 91 | + break; |
| 92 | + case ::arrow::ipc::feather::fbs::Type_INT64: |
| 93 | + dtype = ::tensorflow::DataType::DT_INT64; |
| 94 | + break; |
| 95 | + case ::arrow::ipc::feather::fbs::Type_UINT8: |
| 96 | + dtype = ::tensorflow::DataType::DT_UINT8; |
| 97 | + break; |
| 98 | + case ::arrow::ipc::feather::fbs::Type_UINT16: |
| 99 | + dtype = ::tensorflow::DataType::DT_UINT16; |
| 100 | + break; |
| 101 | + case ::arrow::ipc::feather::fbs::Type_UINT32: |
| 102 | + dtype = ::tensorflow::DataType::DT_UINT32; |
| 103 | + break; |
| 104 | + case ::arrow::ipc::feather::fbs::Type_UINT64: |
| 105 | + dtype = ::tensorflow::DataType::DT_UINT64; |
| 106 | + break; |
| 107 | + case ::arrow::ipc::feather::fbs::Type_FLOAT: |
| 108 | + dtype = ::tensorflow::DataType::DT_FLOAT; |
| 109 | + break; |
| 110 | + case ::arrow::ipc::feather::fbs::Type_DOUBLE: |
| 111 | + dtype = ::tensorflow::DataType::DT_DOUBLE; |
| 112 | + break; |
| 113 | + case ::arrow::ipc::feather::fbs::Type_UTF8: |
| 114 | + case ::arrow::ipc::feather::fbs::Type_BINARY: |
| 115 | + case ::arrow::ipc::feather::fbs::Type_CATEGORY: |
| 116 | + case ::arrow::ipc::feather::fbs::Type_TIMESTAMP: |
| 117 | + case ::arrow::ipc::feather::fbs::Type_DATE: |
| 118 | + case ::arrow::ipc::feather::fbs::Type_TIME: |
| 119 | + // case ::arrow::ipc::feather::fbs::Type_LARGE_UTF8: |
| 120 | + // case ::arrow::ipc::feather::fbs::Type_LARGE_BINARY: |
| 121 | + default: |
| 122 | + break; |
| 123 | + } |
| 124 | + columns.push_back(table->columns()->Get(i)->name()->str()); |
| 125 | + dtypes.push_back(::tensorflow::DataTypeString(dtype)); |
| 126 | + counts.push_back(table->num_rows()); |
| 127 | + } |
| 128 | + |
| 129 | + TensorShape output_shape = filename_tensor.shape(); |
| 130 | + output_shape.AddDim(columns.size()); |
| 131 | + |
| 132 | + Tensor* columns_tensor; |
| 133 | + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &columns_tensor)); |
| 134 | + Tensor* dtypes_tensor; |
| 135 | + OP_REQUIRES_OK(context, context->allocate_output(1, output_shape, &dtypes_tensor)); |
| 136 | + |
| 137 | + output_shape.AddDim(1); |
| 138 | + |
| 139 | + Tensor* shapes_tensor; |
| 140 | + OP_REQUIRES_OK(context, context->allocate_output(2, output_shape, &shapes_tensor)); |
| 141 | + |
| 142 | + for (size_t i = 0; i < columns.size(); i++) { |
| 143 | + columns_tensor->flat<string>()(i) = columns[i]; |
| 144 | + dtypes_tensor->flat<string>()(i) = dtypes[i]; |
| 145 | + shapes_tensor->flat<int64>()(i) = counts[i]; |
| 146 | + } |
| 147 | + } |
| 148 | + private: |
| 149 | + mutex mu_; |
| 150 | + Env* env_ GUARDED_BY(mu_); |
| 151 | +}; |
| 152 | + |
// Registers ListFeatherColumnsOp as the CPU kernel for the
// "ListFeatherColumns" op (the op itself is defined elsewhere).
REGISTER_KERNEL_BUILDER(Name("ListFeatherColumns").Device(DEVICE_CPU),
                        ListFeatherColumnsOp);
| 155 | + |
| 156 | + |
| 157 | +} // namespace |
| 158 | +} // namespace data |
| 159 | +} // namespace tensorflow |
0 commit comments