Skip to content

BUG: read_csv with custom date parser and na_filter=True results in ValueError #36111

@smith1401

Description

@smith1401
import numpy as np
import pandas as pd
from io import StringIO

def __custom_date_parser(time):
    """Convert GPS seconds-of-week values into timedeltas.

    Parameters
    ----------
    time : array-like with an ``astype`` method
        Raw values of the ``time`` column (for example ``"41047.00"``).
        Presumably a NumPy array handed over by ``pd.read_csv`` when this
        function is used as ``date_parser`` -- confirm against the caller.

    Returns
    -------
    The result of ``pd.to_timedelta(..., unit='s')`` for the whole-second
    values.
    """
    # Use the builtin ``float`` / ``int``: the ``np.float`` / ``np.int``
    # aliases were deprecated in NumPy 1.20 and removed in NumPy 1.24, where
    # they raise AttributeError. They were plain aliases of the builtins, so
    # the conversion itself is unchanged.
    # Parse as float first, then truncate to whole seconds.
    seconds = time.astype(float).astype(int)
    return pd.to_timedelta(seconds, unit='s')

# In-memory stand-in for the whitespace-delimited text file: a header row
# followed by rows whose first column is the time in GPS seconds of week.
testdata = StringIO("""time    e   n   h
41047.00	-98573.7297	871458.0640	389.0089
41048.00	-98573.7299	871458.0640	389.0089
41049.00	-98573.7300	871458.0642	389.0088
41050.00	-98573.7299	871458.0643	389.0088
41051.00	-98573.7302	871458.0640	389.0086
    """)

# Parse the 'time' column with the custom parser and use it as the index.
# With the default na_filter=True this call raises
# "ValueError: unit abbreviation w/o a number" on pandas 1.1.1;
# passing na_filter=False avoids the error.
df = pd.read_csv(testdata, delim_whitespace=True, parse_dates=True, date_parser=__custom_date_parser, index_col='time')

I noticed this problem when I executed a piece of old code that worked a few months ago. Normally this code parses a text file with GPS seconds of week as the time column and converts it to a `TimedeltaIndex`. Now, when I execute it, it raises `ValueError: unit abbreviation w/o a number` (full stack trace below). I tracked it down to the default option `na_filter=True` in `pd.read_csv`; when I set it to `False`, everything works. With a bit of digging, I think I found the source of the error in `algorithms.py` -> `_ensure_data` -> line 142.

    # datetimelike
    vals_dtype = getattr(values, "dtype", None)
    if needs_i8_conversion(vals_dtype) or needs_i8_conversion(dtype):
        if is_period_dtype(vals_dtype) or is_period_dtype(dtype):
            from pandas import PeriodIndex

            values = PeriodIndex(values)
            dtype = values.dtype
        elif is_timedelta64_dtype(vals_dtype) or is_timedelta64_dtype(dtype):
            from pandas import TimedeltaIndex

            values = TimedeltaIndex(values)  #This is line 142
            dtype = values.dtype
        else:
            # Datetime
            if values.ndim > 1 and is_datetime64_ns_dtype(vals_dtype):
                # Avoid calling the DatetimeIndex constructor as it is 1D only
                # Note: this is reached by DataFrame.rank calls GH#27027
                # TODO(EA2D): special case not needed with 2D EAs
                asi8 = values.view("i8")
                dtype = values.dtype
                return asi8, dtype

            from pandas import DatetimeIndex

            values = DatetimeIndex(values)
            dtype = values.dtype

Here the function tries to parse `values` as a `TimedeltaIndex`, but in this case `values` is the list of NA strings: ['' 'n/a' '-nan' '#N/A' '1.#QNAN' 'nan' '#NA' 'NaN' '-1.#QNAN' '#N/A N/A', '-NaN' 'N/A' 'NULL' '' 'null' '1.#IND' 'NA' '-1.#IND']. The timedelta branch is taken because `is_timedelta64_dtype(dtype)` is true here. I can't believe that this is the expected behaviour, as it has worked before.

Traceback (most recent call last):
  File "...\lib\site-packages\pandas\io\parsers.py", line 458, in _read
    data = parser.read(nrows)
  File "...\lib\site-packages\pandas\io\parsers.py", line 1186, in read
    ret = self._engine.read(nrows)
  File "...\lib\site-packages\pandas\io\parsers.py", line 2221, in read
    index, names = self._make_index(data, alldata, names)
  File "...\lib\site-packages\pandas\io\parsers.py", line 1667, in _make_index
    index = self._agg_index(index)
  File "...\lib\site-packages\pandas\io\parsers.py", line 1760, in _agg_index
    arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
  File "...\lib\site-packages\pandas\io\parsers.py", line 1861, in _infer_types
    mask = algorithms.isin(values, list(na_values))
  File "...\lib\site-packages\pandas\core\algorithms.py", line 433, in isin
    values, _ = _ensure_data(values, dtype=dtype)
  File "...\lib\site-packages\pandas\core\algorithms.py", line 142, in _ensure_data
    values = TimedeltaIndex(values)
  File "...\lib\site-packages\pandas\core\indexes\timedeltas.py", line 157, in __new__
    data, freq=freq, unit=unit, dtype=dtype, copy=copy
  File "...\lib\site-packages\pandas\core\arrays\timedeltas.py", line 216, in _from_sequence
    data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit)
  File "...\lib\site-packages\pandas\core\arrays\timedeltas.py", line 930, in sequence_to_td64ns
    data = objects_to_td64ns(data, unit=unit, errors=errors)
  File "...\lib\site-packages\pandas\core\arrays\timedeltas.py", line 1040, in objects_to_td64ns
    result = array_to_timedelta64(values, unit=unit, errors=errors)
  File "pandas\_libs\tslibs\timedeltas.pyx", line 273, in pandas._libs.tslibs.timedeltas.array_to_timedelta64
  File "pandas\_libs\tslibs\timedeltas.pyx", line 268, in pandas._libs.tslibs.timedeltas.array_to_timedelta64
  File "pandas\_libs\tslibs\timedeltas.pyx", line 215, in pandas._libs.tslibs.timedeltas.convert_to_timedelta64
  File "pandas\_libs\tslibs\timedeltas.pyx", line 428, in pandas._libs.tslibs.timedeltas.parse_timedelta_string
ValueError: unit abbreviation w/o a number
python-BaseException

INSTALLED VERSIONS

commit : f2ca0a2
python : 3.7.9.final.0
python-bits : 64
OS : Windows
OS-release : 10
Version : 10.0.18362
machine : AMD64
processor : Intel64 Family 6 Model 94 Stepping 3, GenuineIntel
byteorder : little
LC_ALL : None
LANG : None
LOCALE : None.None
pandas : 1.1.1
numpy : 1.18.1
pytz : 2020.1
dateutil : 2.8.1
pip : 20.2.2
setuptools : 49.6.0.post20200814
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : None
IPython : None
pandas_datareader: None
bs4 : None
bottleneck : None
fsspec : None
fastparquet : None
gcsfs : None
matplotlib : 3.1.2
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pytables : None
pyxlsb : None
s3fs : None
scipy : 1.4.1
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : None
numba : None

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions