1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -852,6 +852,7 @@ I/O
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
- Bug in :meth:`read_csv` where the order of ``na_values`` caused inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision for integers too large to round-trip through ``float64``. (:issue:`56136`) A short reproduction follows this list.
- Bug in :meth:`read_excel` raising ``ValueError`` when passing an array of boolean values with ``dtype="boolean"``. (:issue:`58159`)
- Bug in :meth:`read_html` where ``rowspan`` in a header row caused incorrect conversion to ``DataFrame``. (:issue:`60210`)
- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
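
A minimal reproduction of the fixed ``read_csv`` behavior (a sketch; the value is illustrative, any integer above ``2**53`` shows the effect)::

    from io import StringIO

    import pandas as pd

    data = "a\n9007199254740993\n"  # 2**53 + 1, not exactly representable as float64
    df = pd.read_csv(StringIO(data), engine="pyarrow", dtype="Int64")
    assert df["a"][0] == 9007199254740993  # previously came back as 9007199254740992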
82 changes: 79 additions & 3 deletions pandas/io/_util.py
@@ -16,14 +16,23 @@
)
from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.common import pandas_dtype

import pandas as pd

if TYPE_CHECKING:
    from collections.abc import (
        Callable,
        Hashable,
        Sequence,
    )

import pyarrow

    from pandas._typing import (
        DtypeArg,
        DtypeBackend,
    )


def _arrow_dtype_mapping() -> dict:
@@ -64,6 +73,8 @@ def arrow_table_to_pandas(
dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
null_to_int64: bool = False,
to_pandas_kwargs: dict | None = None,
dtype: DtypeArg | None = None,
names: Sequence[Hashable] | None = None,
) -> pd.DataFrame:
pa = import_optional_dependency("pyarrow")

@@ -82,12 +93,77 @@
elif using_string_dtype():
if pa_version_under19p0:
types_mapper = _arrow_string_types_mapper()
elif dtype is not None:
# GH#56136 Avoid lossy conversion to float64
            # We'll convert back to numpy dtypes below where needed
types_mapper = {
pa.int8(): pd.Int8Dtype(),
pa.int16(): pd.Int16Dtype(),
pa.int32(): pd.Int32Dtype(),
pa.int64(): pd.Int64Dtype(),
}.get
else:
types_mapper = None
    elif dtype_backend is lib.no_default or dtype_backend == "numpy":
        if dtype is not None:
            # GH#56136 Avoid lossy conversion to float64
            # We'll convert back to numpy dtypes below where needed
            types_mapper = {
                pa.int8(): pd.Int8Dtype(),
                pa.int16(): pd.Int16Dtype(),
                pa.int32(): pd.Int32Dtype(),
                pa.int64(): pd.Int64Dtype(),
            }.get
        else:
            types_mapper = None
else:
raise NotImplementedError

df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs)
return _post_convert_dtypes(df, dtype_backend, dtype, names)
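
The ``types_mapper`` hook above is what carries the fix: ``pyarrow.Table.to_pandas`` consults it per column, and mapping the Arrow integer types to pandas' nullable integer dtypes avoids the lossy fallback to ``float64`` that pyarrow otherwise uses for integer columns containing nulls. A standalone sketch of the mechanism (values illustrative):

    import pyarrow as pa

    import pandas as pd

    tbl = pa.table({"a": pa.array([2**53 + 1, None], type=pa.int64())})

    # Default conversion: the null forces float64, and 2**53 + 1 is rounded.
    lossy = tbl.to_pandas()
    assert lossy["a"].dtype == "float64"
    assert int(lossy["a"].iloc[0]) == 2**53  # precision lost

    # With a types_mapper the column stays integral via pandas' Int64Dtype.
    exact = tbl.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get)
    assert exact["a"].dtype == "Int64"
    assert exact["a"].iloc[0] == 2**53 + 1  # precision kept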


def _post_convert_dtypes(
df: pd.DataFrame,
dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault,
dtype: DtypeArg | None,
names: Sequence[Hashable] | None,
) -> pd.DataFrame:
if dtype is not None and (
dtype_backend is lib.no_default or dtype_backend == "numpy"
):
# GH#56136 apply any user-provided dtype, and convert any IntegerDtype
# columns the user didn't explicitly ask for.
        if isinstance(dtype, dict):
            # Work on a copy so the caller's mapping is not mutated below.
            dtype = dict(dtype)
            if names is not None:
                df.columns = names

cmp_dtypes = {
pd.Int8Dtype(),
pd.Int16Dtype(),
pd.Int32Dtype(),
pd.Int64Dtype(),
}
for col in df.columns:
if col not in dtype and df[col].dtype in cmp_dtypes:
                    # Any column the user didn't explicitly specify that was
                    # read back as IntegerDtype gets converted to its numpy
                    # dtype here.
dtype[col] = df[col].dtype.numpy_dtype

# Ignore non-existent columns from dtype mapping
# like other parsers do
dtype = {
key: pandas_dtype(dtype[key]) for key in dtype if key in df.columns
}

else:
dtype = pandas_dtype(dtype)

try:
df = df.astype(dtype)
except TypeError as err:
# GH#44901 reraise to keep api consistent
raise ValueError(str(err)) from err

return df
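
End to end, the two steps combine so that columns named in a ``dtype`` dict keep the requested nullable dtype while integer columns the user did not mention are converted back to plain numpy dtypes. A sketch (assuming the unspecified column contains no nulls):

    from io import StringIO

    import pandas as pd

    df = pd.read_csv(
        StringIO("a,b\n1,2\n3,4"),
        engine="pyarrow",
        dtype={"a": "Int64"},
    )
    assert df["a"].dtype == "Int64"  # explicitly requested, stays nullable
    assert df["b"].dtype == "int64"  # converted back to a numpy dtype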
37 changes: 25 additions & 12 deletions pandas/io/parsers/arrow_parser_wrapper.py
@@ -11,13 +11,17 @@
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.inference import is_integer

from pandas.io._util import arrow_table_to_pandas
from pandas.io.parsers.base_parser import ParserBase

if TYPE_CHECKING:
import pyarrow as pa

from pandas._typing import ReadBuffer

from pandas import DataFrame
@@ -162,13 +166,12 @@ def _get_convert_options(self):

return convert_options

    def _adjust_column_names(self, table: pa.Table) -> bool:
        num_cols = len(table.columns)
multi_index_named = True
        if self.header is None:
            if self.names is None:
                self.names = range(num_cols)
if len(self.names) != num_cols:
# usecols is passed through to pyarrow, we only handle index col here
# The only way self.names is not the same length as number of cols is
@@ -177,8 +180,7 @@ def _adjust_column_names(self, frame: DataFrame) -> tuple[DataFrame, bool]:
columns_prefix = [str(x) for x in range(num_cols - len(self.names))]
self.names = columns_prefix + self.names
                multi_index_named = False
        return multi_index_named

def _finalize_index(self, frame: DataFrame, multi_index_named: bool) -> DataFrame:
if self.index_col is not None:
@@ -227,21 +229,23 @@ def _finalize_dtype(self, frame: DataFrame) -> DataFrame:
raise ValueError(str(err)) from err
return frame

    def _finalize_pandas_output(
        self, frame: DataFrame, multi_index_named: bool
    ) -> DataFrame:
"""
Processes data read in based on kwargs.

Parameters
----------
        frame : DataFrame
The DataFrame to process.
        multi_index_named : bool
            Whether to keep the level names of a resulting ``MultiIndex``;
            False when placeholder column names were generated in
            ``_adjust_column_names``.

Returns
-------
DataFrame
The processed DataFrame.
"""
frame = self._do_date_conversions(frame.columns, frame)
frame = self._finalize_index(frame, multi_index_named)
frame = self._finalize_dtype(frame)
@@ -299,14 +303,23 @@ def read(self) -> DataFrame:

table = table.cast(new_schema)

multi_index_named = self._adjust_column_names(table)

with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
"make_block is deprecated",
DeprecationWarning,
)
frame = arrow_table_to_pandas(
                    table,
                    dtype_backend=dtype_backend,
                    null_to_int64=True,
                    dtype=self.dtype,
                    names=self.names,
                )

        if self.header is None:
            frame.columns = self.names

        return self._finalize_pandas_output(frame, multi_index_named)
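
The ``header is None`` branch above keeps positional labels working now that ``_adjust_column_names`` runs on the pyarrow table and no longer assigns ``frame.columns`` itself. A sketch:

    from io import StringIO

    import pandas as pd

    df = pd.read_csv(StringIO("1,2\n3,4"), engine="pyarrow", header=None)
    assert list(df.columns) == [0, 1]  # generated range(num_cols) labels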
4 changes: 0 additions & 4 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -518,9 +518,6 @@ def test_dtype_backend_pyarrow(all_parsers, request):
tm.assert_frame_equal(result, expected)


# pyarrow engine failing:
# https://github.com/pandas-dev/pandas/issues/56136
@pytest.mark.usefixtures("pyarrow_xfail")
def test_ea_int_avoid_overflow(all_parsers):
# GH#32134
parser = all_parsers
@@ -594,7 +591,6 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_accurate_parsing_of_large_integers(all_parsers):
# GH#52505
data = """SYMBOL,MOMENT,ID,ID_DEAL
22 changes: 19 additions & 3 deletions pandas/tests/io/parser/test_na_values.py
@@ -670,11 +670,16 @@ def test_inf_na_values_with_int_index(all_parsers):
tm.assert_frame_equal(out, expected)


@pytest.mark.parametrize("na_filter", [True, False])
def test_na_values_with_dtype_str_and_na_filter(
    all_parsers, na_filter, using_infer_string, request
):
# see gh-20377
parser = all_parsers
if parser.engine == "pyarrow" and (na_filter is False or not using_infer_string):
mark = pytest.mark.xfail(reason="mismatched shape")
request.applymarker(mark)

data = "a,b,c\n1,,3\n4,5,6"

# na_filter=True --> missing value becomes NaN.
Expand Down Expand Up @@ -798,7 +803,18 @@ def test_bool_and_nan_to_int(all_parsers):
True
False
"""
    msg = (
        "cannot safely convert passed user dtype of int(64|32) for "
        "<class 'numpy.bool_?'> dtyped data in column 0 due to NA values"
    )
    if parser.engine == "python":
        msg = "Unable to convert column 0 to type int(64|32)"
    elif parser.engine == "pyarrow":
        msg = (
            r"int\(\) argument must be a string, a bytes-like object or a "
            "real number, not 'NoneType"
        )
    with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dtype="int")

