Skip to content

Commit 55d9750

Browse files
vlasenkoalexey and yongtang
authored and committed
More changes for BigQuery connector (#490)
* Fixing Dockerfile * Returning dataset in a form of Dictionary from BigQuery connector * Adding NULL fields support to BigQuery connector * python style tweak * more style tweaks * Style tweaks, coming from google account
1 parent ea53711 commit 55d9750

File tree

3 files changed

+22
-3
lines changed

3 files changed

+22
-3
lines changed

tensorflow_io/bigquery/kernels/bigquery_lib.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,9 @@ class BigQueryReaderDatasetIterator : public DatasetIterator<Dataset> {
184184
case avro::AVRO_ENUM:
185185
dtype = DT_STRING;
186186
break;
187+
case avro::AVRO_NULL:
188+
dtype = output_types[i];
189+
break;
187190
default:
188191
return errors::InvalidArgument("unsupported data type: ",
189192
field.type());
@@ -250,6 +253,8 @@ class BigQueryReaderDatasetIterator : public DatasetIterator<Dataset> {
250253
((*out_tensors)[i]).scalar<string>()() =
251254
field.value<avro::GenericEnum>().symbol();
252255
break;
256+
case avro::AVRO_NULL: // Fallthrough;
257+
break;
253258
default:
254259
return errors::InvalidArgument("unsupported data type: ",
255260
field.type());

tensorflow_io/bigquery/python/ops/bigquery_api.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727
from __future__ import division
2828
from __future__ import print_function
2929

30+
import collections
31+
from operator import itemgetter
32+
3033
from tensorflow.python.data.experimental.ops import interleave_ops
3134
from tensorflow.python.data.ops import dataset_ops
3235
from tensorflow.python.framework import dtypes
@@ -223,8 +226,19 @@ class _BigQueryDataset(dataset_ops.DatasetSource):
223226

224227
def __init__(self, client_resource, selected_fields, output_types,
225228
avro_schema, stream):
226-
self._element_spec = tuple(
227-
tensor_spec.TensorSpec([], dtype) for dtype in output_types)
229+
230+
# selected_fields and corresponding output_types have to be sorted because
231+
# of b/141251314
232+
sorted_fields_with_types = sorted(
233+
zip(selected_fields, output_types),
234+
key=itemgetter(0))
235+
selected_fields, output_types = list(zip(*sorted_fields_with_types))
236+
selected_fields = list(selected_fields)
237+
output_types = list(output_types)
238+
239+
self._element_spec = collections.OrderedDict(zip(
240+
selected_fields,
241+
(tensor_spec.TensorSpec([], dtype) for dtype in output_types)))
228242

229243
variant_tensor = _bigquery_so.big_query_dataset(
230244
client=client_resource,

tools/dev/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ RUN /bin/bash -c "source activate tfio-dev && python -m pip install \
4949
pyarrow==${ARROW_VERSION} \
5050
pandas \
5151
fastavro \
52-
gast==0.2.2
52+
gast==0.2.2 \
5353
${PIP_ADD_PACKAGES} \
5454
"
5555

0 commit comments

Comments
 (0)