@@ -27,6 +27,7 @@
 from itertools import imap as map
 from cgi import escape as html_escape

+from collections import Counter
 import warnings

 from pyspark import copy_func, since, _NoValue
@@ -2148,21 +2149,48 @@ def toPandas(self):

         # Below is toPandas without Arrow optimization.
         pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
+        column_counter = Counter(self.columns)
+
+        dtype = [None] * len(self.schema)
+        for fieldIdx, field in enumerate(self.schema):
+            # For a duplicate column name, use `iloc` to access the column by position.
+            if column_counter[field.name] > 1:
+                pandas_col = pdf.iloc[:, fieldIdx]
+            else:
+                pandas_col = pdf[field.name]

-        dtype = {}
-        for field in self.schema:
             pandas_type = _to_corrected_pandas_type(field.dataType)
             # SPARK-21766: if an integer field is nullable and has null values, it can be
             # inferred by pandas as float column. Once we convert the column with NaN back
             # to integer type e.g., np.int16, we will hit exception. So we use the inferred
             # float type, not the corrected type from the schema in this case.
             if pandas_type is not None and \
                     not(isinstance(field.dataType, IntegralType) and field.nullable and
-                        pdf[field.name].isnull().any()):
-                dtype[field.name] = pandas_type
+                        pandas_col.isnull().any()):
+                dtype[fieldIdx] = pandas_type
+
+        df = pd.DataFrame()
+        for index, t in enumerate(dtype):
+            column_name = self.schema[index].name
+
+            # For a duplicate column name, use `iloc` to access the column by position.
+            if column_counter[column_name] > 1:
+                series = pdf.iloc[:, index]
+            else:
+                series = pdf[column_name]
+
+            if t is not None:
+                series = series.astype(t, copy=False)
+
+            # The `insert` API makes a copy of the data; we only use it for Series with duplicate
+            # column names. `pdf.iloc[:, index] = pdf.iloc[:, index]...` doesn't always work
+            # because `iloc` may return either a view or a copy depending on the context.
+            if column_counter[column_name] > 1:
+                df.insert(index, column_name, series, allow_duplicates=True)
+            else:
+                df[column_name] = series

-        for f, t in dtype.items():
-            pdf[f] = pdf[f].astype(t, copy=False)
+        pdf = df

         if timezone is None:
             return pdf
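A minimal usage sketch of the behavior this hunk targets (assuming a throwaway local `SparkSession`; the printed dtypes are indicative rather than exact):

```python
from pyspark.sql import SparkSession

# Assumption: a local session purely for illustration.
spark = SparkSession.builder.master("local[1]").appName("toPandas-sketch").getOrCreate()

# Duplicate column names (e.g. from a join or repeated aliases): dtype correction is
# now applied positionally via `iloc`, so conversion no longer trips over the name clash.
dup = spark.sql("SELECT 1 AS id, 2 AS id")
pdf = dup.toPandas()
print(list(pdf.columns))   # ['id', 'id']
print(list(pdf.dtypes))    # e.g. [dtype('int32'), dtype('int32')]

# SPARK-21766 caveat preserved by the hunk: a nullable integer column that actually holds
# nulls keeps the pandas-inferred float dtype, since NaN cannot be stored in an int column.
nullable = spark.sql("SELECT CAST(NULL AS INT) AS x UNION ALL SELECT 1 AS x")
print(list(nullable.toPandas().dtypes))   # e.g. [dtype('float64')]

spark.stop()
```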