11import abc
22import datetime
3+ from distutils .version import LooseVersion
34import inspect
45from io import BufferedIOBase , BytesIO , RawIOBase
56import os
67from textwrap import fill
7- from typing import Any , Dict , Mapping , Union , cast
8+ from typing import IO , Any , Dict , Mapping , Optional , Union , cast
89import warnings
10+ import zipfile
911
1012from pandas ._config import config
1113
1214from pandas ._libs .parsers import STR_NA_VALUES
1315from pandas ._typing import Buffer , FilePathOrBuffer , StorageOptions
1416from pandas .compat ._optional import import_optional_dependency
1517from pandas .errors import EmptyDataError
16- from pandas .util ._decorators import Appender , deprecate_nonkeyword_arguments
18+ from pandas .util ._decorators import Appender , deprecate_nonkeyword_arguments , doc
1719
1820from pandas .core .dtypes .common import is_bool , is_float , is_integer , is_list_like
1921
2022from pandas .core .frame import DataFrame
23+ from pandas .core .shared_docs import _shared_docs
2124
2225from pandas .io .common import IOHandles , get_handle , stringify_path , validate_header_arg
2326from pandas .io .excel ._util import (
116119 When ``engine=None``, the following logic will be
117120 used to determine the engine:
118121
119- - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
120- then `odf <https://pypi.org/project/odfpy/>`_ will be used.
121- - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
122- extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will
123- be used.
124- - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
125- then ``openpyxl`` will be used.
126- - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
127-
128- Specifying ``engine="xlrd"`` will continue to be allowed for the
129- indefinite future.
122+ - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
123+ then `odf <https://pypi.org/project/odfpy/>`_ will be used.
124+ - Otherwise if ``path_or_buffer`` is an xls format,
125+ ``xlrd`` will be used.
126+ - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
127+ then ``openpyxl`` will be used.
128+ - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
129+ - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. This
130+ case will raise a ``ValueError`` in a future version of pandas.
130131
131132converters : dict, default None
132133 Dict of functions for converting values in certain columns. Keys can
@@ -888,39 +889,92 @@ def close(self):
888889 return content
889890
890891
# Magic bytes of an OLE2 compound document (legacy binary .xls workbooks).
XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
# Magic bytes of a zip local file header (xlsx, xlsb and ods are zip archives).
ZIP_SIGNATURE = b"PK\x03\x04"
# Number of leading bytes needed to test for either signature.
PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE))


@doc(storage_options=_shared_docs["storage_options"])
def inspect_excel_format(
    path: Optional[str] = None,
    content: Union[None, BufferedIOBase, RawIOBase, bytes] = None,
    storage_options: StorageOptions = None,
) -> str:
    """
    Inspect the path or content of an excel file and get its format.

    At least one of path or content must be not None. If both are not None,
    content will take precedence.

    Adopted from xlrd: https://github.com/python-excel/xlrd.

    Parameters
    ----------
    path : str, optional
        Path to file to inspect. May be a URL.
    content : file-like object, optional
        Content of file to inspect.
    {storage_options}

    Returns
    -------
    str
        Format of file: "xls", "xlsx", "xlsb" or "ods"; "zip" is returned for
        a zip archive that contains none of the known workbook components.

    Raises
    ------
    ValueError
        If neither path nor content is given, if the resulting stream is
        empty, or if it starts with neither the XLS nor the zip signature.
    BadZipFile
        If resulting stream does not have an XLS signature and is not a valid zipfile.
    """
    content_or_path: Union[None, str, BufferedIOBase, RawIOBase, IO[bytes]]
    if isinstance(content, bytes):
        content_or_path = BytesIO(content)
    elif content is not None:
        # Explicit None test: do not depend on the truthiness of a file object.
        content_or_path = content
    else:
        content_or_path = path
    if content_or_path is None:
        # Raised explicitly because an assert would be stripped under ``-O``.
        raise ValueError("At least one of path or content must be not None")

    with get_handle(
        content_or_path, "rb", storage_options=storage_options, is_text=False
    ) as handle:
        stream = handle.handle
        stream.seek(0)
        peek = stream.read(PEEK_SIZE)
        stream.seek(0)

        # A blocking binary read returns b"" at end of file; None is only
        # returned by non-blocking raw streams with no data available.
        if not peek:
            raise ValueError("stream is empty")
        if not isinstance(peek, bytes):
            raise ValueError("File is not a recognized excel file")

        if peek.startswith(XLS_SIGNATURE):
            return "xls"
        if not peek.startswith(ZIP_SIGNATURE):
            raise ValueError("File is not a recognized excel file")

        try:
            # ZipFile typing is overly-strict
            # https://github.com/python/typeshed/issues/4212
            with zipfile.ZipFile(stream) as zf:  # type: ignore[arg-type]
                # Workaround for some third party files that use backslashes
                # and mixed-case names: normalise before comparing.
                component_names = {
                    name.replace("\\", "/").lower() for name in zf.namelist()
                }
        finally:
            # Leave a caller-supplied buffer rewound, as it was before the
            # archive directory was read.
            stream.seek(0)

    if "xl/workbook.xml" in component_names:
        return "xlsx"
    if "xl/workbook.bin" in component_names:
        return "xlsb"
    if "content.xml" in component_names:
        return "ods"
    return "zip"
917971
918972
919973class ExcelFile :
920974 """
921975 Class for parsing tabular excel sheets into DataFrame objects.
922976
923- See read_excel for more documentation
977+ See read_excel for more documentation.
924978
925979 Parameters
926980 ----------
@@ -947,12 +1001,13 @@ class ExcelFile:
9471001
9481002 - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
9491003 then `odf <https://pypi.org/project/odfpy/>`_ will be used.
950- - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
951- extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd``
952- will be used.
1004+ - Otherwise if ``path_or_buffer`` is an xls format,
1005+ ``xlrd`` will be used.
9531006 - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
9541007 then ``openpyxl`` will be used.
1008+ - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
9551009 - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
1010+ This case will raise a ``ValueError`` in a future version of pandas.
9561011
9571012 .. warning::
9581013
@@ -975,71 +1030,87 @@ class ExcelFile:
def __init__(
    self, path_or_buffer, engine=None, storage_options: StorageOptions = None
):
    """
    Pick a reader engine for ``path_or_buffer`` and open the workbook with it.

    Parameters
    ----------
    path_or_buffer : str, path object, bytes, file-like object or xlrd Book
        The workbook to open.
    engine : str, optional
        Key into ``self._engines``. When None, the engine is derived from the
        detected file format and from which optional dependencies import.
    storage_options : dict, optional
        Passed through to format inspection and to the engine's reader.

    Raises
    ------
    ValueError
        If ``engine`` is not a key of ``self._engines``, or if the xlrd engine
        would be used for a non-xls file while xlrd >= 2.0 is installed.
    """
    if engine is not None and engine not in self._engines:
        raise ValueError(f"Unknown engine: {engine}")

    # Could be a str, ExcelFile, Book, etc.
    self.io = path_or_buffer
    # Path-like objects converted to str; other objects are kept as passed
    self._io = stringify_path(path_or_buffer)

    # Determine xlrd version if installed
    if (
        import_optional_dependency(
            "xlrd", raise_on_missing=False, on_version="ignore"
        )
        is None
    ):
        xlrd_version = None
    else:
        import xlrd

        xlrd_version = LooseVersion(xlrd.__version__)

    # The format is only consulted when the engine has to be inferred or when
    # the xlrd compatibility check below can fire. Skipping the inspection
    # otherwise avoids opening the file an extra time and avoids rejecting
    # inputs that the explicitly requested engine may be able to handle.
    ext = None
    if engine is None or (engine == "xlrd" and xlrd_version is not None):
        if isinstance(
            path_or_buffer, (BufferedIOBase, RawIOBase, bytes)
        ) or hasattr(path_or_buffer, "read"):
            # Raw bytes or any readable handle: sniff the content directly
            ext = inspect_excel_format(
                content=path_or_buffer, storage_options=storage_options
            )
        elif xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
            ext = "xls"
        else:
            # path_or_buffer is path-like, use stringified path
            ext = inspect_excel_format(
                path=str(self._io), storage_options=storage_options
            )

    if engine is None:
        if ext == "ods":
            engine = "odf"
        elif ext == "xls":
            engine = "xlrd"
        else:
            # GH 35029 - Prefer openpyxl except for xls files
            if (
                import_optional_dependency(
                    "openpyxl", raise_on_missing=False, on_version="ignore"
                )
                is not None
            ):
                engine = "openpyxl"
            else:
                engine = "xlrd"

    if engine == "xlrd" and ext != "xls" and xlrd_version is not None:
        if xlrd_version >= "2":
            raise ValueError(
                f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                f"only the xls format is supported. Install openpyxl instead."
            )
        else:
            # Kept inline on purpose: stack()[1] must be the direct caller of
            # __init__ so the warning is attributed to the user's call site.
            caller = inspect.stack()[1]
            if (
                caller.filename.endswith(
                    os.path.join("pandas", "io", "excel", "_base.py")
                )
                and caller.function == "read_excel"
            ):
                stacklevel = 4
            else:
                stacklevel = 2
            warnings.warn(
                f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                f"only the xls format is supported. As a result, the "
                f"openpyxl engine will be used if it is installed and the "
                f"engine argument is not specified. Install "
                f"openpyxl instead.",
                FutureWarning,
                stacklevel=stacklevel,
            )
    # Internal invariant: user-supplied engines were validated above, and the
    # inferred ones are expected to be registered in ``_engines``.
    assert engine in self._engines, f"Engine {engine} not recognized"

    self.engine = engine
    self.storage_options = storage_options

    self._reader = self._engines[engine](self._io, storage_options=storage_options)
10441115
10451116 def __fspath__ (self ):
0 commit comments