Support explicitly setting a dimension order with to_dataframe() (#4333)

Thomas-Z · tzilio · web-flow · commit 1f45bcaecd06 · 2020-08-14T18:28:25.000Z
* #4331: Adding dim_order parameter to Dataset.to_dataframe * #4331: Typo * #4331: Adding dim_order parameter to DataArray.to_dataframe. Refactoring some code, fixing some docstring. * #4331: Updating whats-new.rst * #4331: Updating whats-new.rst (bis) Co-authored-by: tzilio <tzilio@groupcls.com>
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -45,6 +45,10 @@ New Features
   of :py:class:`DataArray` and :py:class:`Dataset` objects and
   document the new method in :doc:`internals`. (:pull:`4248`).
   By `Justus Magin <https://github.com/keewis>`_.
+- :py:meth:`~xarray.DataArray.to_dataframe` and :py:meth:`~xarray.Dataset.to_dataframe`
+  now accept a ``dim_order`` parameter that specifies the dimension order of the
+  resulting dataframe (:issue:`4331`, :pull:`4333`).
+  By `Thomas Zilio <https://github.com/thomas-z>`_.
 
 
 Bug fixes
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -2365,13 +2365,36 @@ def to_pandas(self) -> Union["DataArray", pd.Series, pd.DataFrame]:
         indexes = [self.get_index(dim) for dim in self.dims]
         return constructor(self.values, *indexes)
 
-    def to_dataframe(self, name: Hashable = None) -> pd.DataFrame:
+    def to_dataframe(
+        self, name: Hashable = None, dim_order: List[Hashable] = None
+    ) -> pd.DataFrame:
         """Convert this array and its coordinates into a tidy pandas.DataFrame.
 
         The DataFrame is indexed by the Cartesian product of index coordinates
         (in the form of a :py:class:`pandas.MultiIndex`).
 
         Other coordinates are included as columns in the DataFrame.
+
+        Parameters
+        ----------
+        name
+            Name to give to this array (required if unnamed).
+        dim_order
+            Hierarchical dimension order for the resulting dataframe.
+            Array content is transposed to this order and then written out as flat
+            vectors in contiguous order, so the last dimension in this list
+            will be contiguous in the resulting DataFrame. This has a major
+            influence on which operations are efficient on the resulting
+            dataframe.
+
+            If provided, must include all dimensions of this DataArray. By default,
+            dimensions keep the order given by this DataArray's ``dims``.
+
+        Returns
+        -------
+        result
+            DataArray as a pandas DataFrame.
+
         """
         if name is None:
             name = self.name
@@ -2381,15 +2404,20 @@ def to_dataframe(self, name: Hashable = None) -> pd.DataFrame:
                 "DataFrame: use the ``name`` parameter"
             )
 
-        dims = dict(zip(self.dims, self.shape))
         # By using a unique name, we can convert a DataArray into a DataFrame
         # even if it shares a name with one of its coordinates.
         # I would normally use unique_name = object() but that results in a
         # dataframe with columns in the wrong order, for reasons I have not
         # been able to debug (possibly a pandas bug?).
         unique_name = "__unique_name_identifier_z98xfz98xugfg73ho__"
         ds = self._to_dataset_whole(name=unique_name)
-        df = ds._to_dataframe(dims)
+
+        if dim_order is None:
+            ordered_dims = dict(zip(self.dims, self.shape))
+        else:
+            ordered_dims = ds._normalize_dim_order(dim_order=dim_order)
+
+        df = ds._to_dataframe(ordered_dims)
         df.columns = [name if c == unique_name else c for c in df.columns]
         return df
 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -4524,23 +4524,75 @@ def to_array(self, dim="variable", name=None):
             data, coords, dims, attrs=self.attrs, name=name, indexes=indexes
         )
 
-    def _to_dataframe(self, ordered_dims):
+    def _normalize_dim_order(
+        self, dim_order: List[Hashable] = None
+    ) -> Dict[Hashable, int]:
+        """
+        Check the validity of the provided dimensions if any and return the mapping
+        between dimension name and their size.
+
+        Parameters
+        ----------
+        dim_order
+            Dimension order to validate (defaults to alphabetical order if None).
+
+        Returns
+        -------
+        result
+            Validated dimensions mapping.
+
+        """
+        if dim_order is None:
+            dim_order = list(self.dims)
+        elif set(dim_order) != set(self.dims):
+            raise ValueError(
+                "dim_order {} does not match the set of dimensions of this "
+                "Dataset: {}".format(dim_order, list(self.dims))
+            )
+
+        ordered_dims = {k: self.dims[k] for k in dim_order}
+
+        return ordered_dims
+
+    def _to_dataframe(self, ordered_dims: Mapping[Hashable, int]):
         columns = [k for k in self.variables if k not in self.dims]
         data = [
             self._variables[k].set_dims(ordered_dims).values.reshape(-1)
             for k in columns
         ]
-        index = self.coords.to_index(ordered_dims)
+        index = self.coords.to_index([*ordered_dims])
         return pd.DataFrame(dict(zip(columns, data)), index=index)
 
-    def to_dataframe(self):
+    def to_dataframe(self, dim_order: List[Hashable] = None) -> pd.DataFrame:
         """Convert this dataset into a pandas.DataFrame.
 
         Non-index variables in this dataset form the columns of the
-        DataFrame. The DataFrame is be indexed by the Cartesian product of
+        DataFrame. The DataFrame is indexed by the Cartesian product of
         this dataset's indices.
+
+        Parameters
+        ----------
+        dim_order
+            Hierarchical dimension order for the resulting dataframe. All
+            arrays are transposed to this order and then written out as flat
+            vectors in contiguous order, so the last dimension in this list
+            will be contiguous in the resulting DataFrame. This has a major
+            influence on which operations are efficient on the resulting
+            dataframe.
+
+            If provided, must include all dimensions of this dataset. By
+            default, dimensions are sorted alphabetically.
+
+        Returns
+        -------
+        result
+            Dataset as a pandas DataFrame.
+
         """
-        return self._to_dataframe(self.dims)
+
+        ordered_dims = self._normalize_dim_order(dim_order=dim_order)
+
+        return self._to_dataframe(ordered_dims=ordered_dims)
 
     def _set_sparse_data_from_dataframe(
         self, idx: pd.Index, arrays: List[Tuple[Hashable, np.ndarray]], dims: tuple
@@ -4694,11 +4746,11 @@ def to_dask_dataframe(self, dim_order=None, set_index=False):
             influence on which operations are efficient on the resulting dask
             dataframe.
 
-            If provided, must include all dimensions on this dataset. By
+            If provided, must include all dimensions of this dataset. By
             default, dimensions are sorted alphabetically.
         set_index : bool, optional
             If set_index=True, the dask DataFrame is indexed by this dataset's
-            coordinate. Since dask DataFrames to not support multi-indexes,
+            coordinate. Since dask DataFrames do not support multi-indexes,
             set_index only works if the dataset only contains one dimension.
 
         Returns
@@ -4709,15 +4761,7 @@ def to_dask_dataframe(self, dim_order=None, set_index=False):
         import dask.array as da
         import dask.dataframe as dd
 
-        if dim_order is None:
-            dim_order = list(self.dims)
-        elif set(dim_order) != set(self.dims):
-            raise ValueError(
-                "dim_order {} does not match the set of dimensions on this "
-                "Dataset: {}".format(dim_order, list(self.dims))
-            )
-
-        ordered_dims = {k: self.dims[k] for k in dim_order}
+        ordered_dims = self._normalize_dim_order(dim_order=dim_order)
 
         columns = list(ordered_dims)
         columns.extend(k for k in self.coords if k not in self.dims)
@@ -4744,6 +4788,8 @@ def to_dask_dataframe(self, dim_order=None, set_index=False):
         df = dd.concat(series_list, axis=1)
 
         if set_index:
+            dim_order = [*ordered_dims]
+
             if len(dim_order) == 1:
                 (dim,) = dim_order
                 df = df.set_index(dim)
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
@@ -3463,15 +3463,18 @@ def test_to_pandas(self):
 
     def test_to_dataframe(self):
         # regression test for #260
-        arr = DataArray(
-            np.random.randn(3, 4), [("B", [1, 2, 3]), ("A", list("cdef"))], name="foo"
-        )
+        arr_np = np.random.randn(3, 4)
+
+        arr = DataArray(arr_np, [("B", [1, 2, 3]), ("A", list("cdef"))], name="foo")
         expected = arr.to_series()
         actual = arr.to_dataframe()["foo"]
         assert_array_equal(expected.values, actual.values)
         assert_array_equal(expected.name, actual.name)
         assert_array_equal(expected.index.values, actual.index.values)
 
+        actual = arr.to_dataframe(dim_order=["A", "B"])["foo"]
+        assert_array_equal(arr_np.transpose().reshape(-1), actual.values)
+
         # regression test for coords with different dimensions
         arr.coords["C"] = ("B", [-1, -2, -3])
         expected = arr.to_series().to_frame()
@@ -3482,6 +3485,9 @@ def test_to_dataframe(self):
         assert_array_equal(expected.columns.values, actual.columns.values)
         assert_array_equal(expected.index.values, actual.index.values)
 
+        with pytest.raises(ValueError, match="does not match the set of dimensions"):
+            arr.to_dataframe(dim_order=["B", "A", "C"])
+
         arr.name = None  # unnamed
         with raises_regex(ValueError, "unnamed"):
             arr.to_dataframe()
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -3939,6 +3939,33 @@ def test_to_and_from_dataframe(self):
         # check roundtrip
         assert_identical(ds.assign_coords(x=[0, 1]), Dataset.from_dataframe(actual))
 
+        # Check multiindex reordering
+        new_order = ["x", "y"]
+        actual = ds.to_dataframe(dim_order=new_order)
+        assert expected.equals(actual)
+
+        new_order = ["y", "x"]
+        exp_index = pd.MultiIndex.from_arrays(
+            [["a", "a", "b", "b", "c", "c"], [0, 1, 0, 1, 0, 1]], names=["y", "x"]
+        )
+        expected = pd.DataFrame(
+            w.transpose().reshape(-1), columns=["w"], index=exp_index
+        )
+        actual = ds.to_dataframe(dim_order=new_order)
+        assert expected.equals(actual)
+
+        invalid_order = ["x"]
+        with pytest.raises(
+            ValueError, match="does not match the set of dimensions of this"
+        ):
+            ds.to_dataframe(dim_order=invalid_order)
+
+        invalid_order = ["x", "z"]
+        with pytest.raises(
+            ValueError, match="does not match the set of dimensions of this"
+        ):
+            ds.to_dataframe(dim_order=invalid_order)
+
         # check pathological cases
         df = pd.DataFrame([1])
         actual = Dataset.from_dataframe(df)