@@ -4524,23 +4524,75 @@ def to_array(self, dim="variable", name=None):
4524
4524
data , coords , dims , attrs = self .attrs , name = name , indexes = indexes
4525
4525
)
4526
4526
4527
- def _to_dataframe (self , ordered_dims ):
4527
def _normalize_dim_order(
    self, dim_order: List[Hashable] = None
) -> Dict[Hashable, int]:
    """
    Validate an optional dimension ordering and map each dimension to its size.

    Parameters
    ----------
    dim_order
        Requested ordering of the dimension names. When None, the iteration
        order of ``self.dims`` is used.

    Returns
    -------
    result
        Dictionary from dimension name to ``self.dims[name]``, with keys
        inserted in the requested order.

    Raises
    ------
    ValueError
        If ``dim_order`` is given and its set of names differs from the set
        of names in ``self.dims``.
    """
    known_dims = self.dims

    # No explicit order requested: keep whatever order ``self.dims`` yields.
    if dim_order is None:
        return {name: known_dims[name] for name in known_dims}

    # Only set equality is checked; the dict built below is keyed by name,
    # so a repeated name in ``dim_order`` collapses to a single entry.
    if set(dim_order) != set(known_dims):
        raise ValueError(
            "dim_order {} does not match the set of dimensions of this "
            "Dataset: {}".format(dim_order, list(known_dims))
        )

    return {name: known_dims[name] for name in dim_order}
4556
+
4557
def _to_dataframe(self, ordered_dims: Mapping[Hashable, int]):
    """Assemble a pandas.DataFrame from the variables that are not dimensions.

    Parameters
    ----------
    ordered_dims
        Mapping from dimension name to size. It is passed to each variable's
        ``set_dims`` and its keys, in order, are passed to
        ``self.coords.to_index``.

    Returns
    -------
    result
        DataFrame with one column per variable whose name is not in
        ``self.dims``, indexed by ``self.coords.to_index(...)``.
    """
    dim_names = self.dims
    column_names = [name for name in self.variables if name not in dim_names]

    # Each variable is passed through ``set_dims(ordered_dims)`` and then
    # flattened to 1-D so that it can serve as a single DataFrame column.
    flattened = {}
    for name in column_names:
        expanded = self._variables[name].set_dims(ordered_dims)
        flattened[name] = expanded.values.reshape(-1)

    # ``to_index`` receives only the dimension names, as a list.
    row_index = self.coords.to_index(list(ordered_dims))
    return pd.DataFrame(flattened, index=row_index)
4535
4565
4536
- def to_dataframe (self ) :
4566
def to_dataframe(self, dim_order: List[Hashable] = None) -> pd.DataFrame:
    """Convert this dataset into a pandas.DataFrame.

    Variables whose names are not dimensions of this dataset become the
    columns of the DataFrame. The index is built from this dataset's
    coordinates, following the chosen dimension order.

    Parameters
    ----------
    dim_order
        Hierarchical dimension order for the resulting dataframe. Every
        array is arranged in this order and then flattened into a 1-D
        column, so the last dimension in this list will be contiguous in
        the resulting DataFrame. This has a major influence on which
        operations are efficient on the resulting dataframe.

        If provided, must include all dimensions of this dataset
        (otherwise a ValueError is raised). By default, the order of
        ``self.dims`` is used.

    Returns
    -------
    result
        Dataset as a pandas DataFrame.

    """
    # Validation and ordering are delegated to the shared helper.
    return self._to_dataframe(
        ordered_dims=self._normalize_dim_order(dim_order=dim_order)
    )
4544
4596
4545
4597
def _set_sparse_data_from_dataframe (
4546
4598
self , idx : pd .Index , arrays : List [Tuple [Hashable , np .ndarray ]], dims : tuple
@@ -4694,11 +4746,11 @@ def to_dask_dataframe(self, dim_order=None, set_index=False):
4694
4746
influence on which operations are efficient on the resulting dask
4695
4747
dataframe.
4696
4748
4697
- If provided, must include all dimensions on this dataset. By
4749
+ If provided, must include all dimensions of this dataset. By
4698
4750
default, dimensions are sorted alphabetically.
4699
4751
set_index : bool, optional
4700
4752
If set_index=True, the dask DataFrame is indexed by this dataset's
4701
- coordinate. Since dask DataFrames to not support multi-indexes,
4753
+ coordinate. Since dask DataFrames do not support multi-indexes,
4702
4754
set_index only works if the dataset only contains one dimension.
4703
4755
4704
4756
Returns
@@ -4709,15 +4761,7 @@ def to_dask_dataframe(self, dim_order=None, set_index=False):
4709
4761
import dask .array as da
4710
4762
import dask .dataframe as dd
4711
4763
4712
- if dim_order is None :
4713
- dim_order = list (self .dims )
4714
- elif set (dim_order ) != set (self .dims ):
4715
- raise ValueError (
4716
- "dim_order {} does not match the set of dimensions on this "
4717
- "Dataset: {}" .format (dim_order , list (self .dims ))
4718
- )
4719
-
4720
- ordered_dims = {k : self .dims [k ] for k in dim_order }
4764
+ ordered_dims = self ._normalize_dim_order (dim_order = dim_order )
4721
4765
4722
4766
columns = list (ordered_dims )
4723
4767
columns .extend (k for k in self .coords if k not in self .dims )
@@ -4744,6 +4788,8 @@ def to_dask_dataframe(self, dim_order=None, set_index=False):
4744
4788
df = dd .concat (series_list , axis = 1 )
4745
4789
4746
4790
if set_index :
4791
+ dim_order = [* ordered_dims ]
4792
+
4747
4793
if len (dim_order ) == 1 :
4748
4794
(dim ,) = dim_order
4749
4795
df = df .set_index (dim )
0 commit comments