From 97b751c9baff792acffd4896480d8b0e8102f263 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sun, 9 Aug 2020 12:47:34 -0400 Subject: [PATCH 1/2] io/common: use gzip.GzipFile instead of gzip.open --- doc/source/user_guide/io.rst | 11 +++++--- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/generic.py | 6 ++++ pandas/io/common.py | 7 ++--- pandas/tests/io/test_compression.py | 43 +++++++++++++++++++++++++++++ 5 files changed, 59 insertions(+), 9 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 35403b5c8b66f..43030d76d945a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -287,16 +287,19 @@ Quoting, compression, and file format compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'`` For on-the-fly decompression of on-disk data. If 'infer', then use gzip, - bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2', + bz2, zip, or xz if ``filepath_or_buffer`` is path-like ending in '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. Set to ``None`` for no decompression. Can also be a dict with key ``'method'`` - set to one of {``'zip'``, ``'gzip'``, ``'bz2'``}, and other keys set to - compression settings. As an example, the following could be passed for - faster compression: ``compression={'method': 'gzip', 'compresslevel': 1}``. + set to one of {``'zip'``, ``'gzip'``, ``'bz2'``} and other key-value pairs are + forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, or ``bz2.BZ2File``. + As an example, the following could be passed for faster compression and to + create a reproducible gzip archive: + ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``. .. versionchanged:: 0.24.0 'infer' option added and set to default. .. 
versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``. + .. versionchanged:: 1.2.0 Previous versions forwarded dict entries for 'gzip' to `gzip.open` instead of `gzip.GzipFile`. thousands : str, default ``None`` Thousands separator. decimal : str, default ``'.'`` diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index deb5697053ea8..6612f741d925d 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -235,6 +235,7 @@ I/O - Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`) - In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`) - :meth:`to_pickle` and :meth:`read_pickle` were closing user-provided file objects (:issue:`35679`) +- :meth:`to_csv` now always passes compression arguments for `'gzip'` to `gzip.GzipFile` (:issue:`28103`) Plotting ^^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 11147bffa32c3..34fce4e398864 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3144,6 +3144,12 @@ def to_csv( Compression is supported for binary file objects. + .. versionchanged:: 1.2.0 + + Previous versions forwarded dict entries for 'gzip' to + `gzip.open` instead of `gzip.GzipFile`, which prevented + setting `mtime`. + quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. 
If you have set a `float_format` then floats are converted to strings and thus csv.QUOTE_NONNUMERIC diff --git a/pandas/io/common.py b/pandas/io/common.py index 9ac642e58b544..8b48cbbdb56a8 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -464,16 +464,13 @@ def get_handle( # GZ Compression if compression == "gzip": if is_path: - f = gzip.open(path_or_buf, mode, **compression_args) + f = gzip.GzipFile(filename=path_or_buf, mode=mode, **compression_args) else: f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args) # BZ Compression elif compression == "bz2": - if is_path: - f = bz2.BZ2File(path_or_buf, mode, **compression_args) - else: - f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args) + f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args) # ZIP Compression elif compression == "zip": diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 902a3d5d2a397..bc14b485f75e5 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -1,7 +1,10 @@ +import io import os +from pathlib import Path import subprocess import sys import textwrap +import time import pytest @@ -130,6 +133,46 @@ def test_compression_binary(compression_only): ) +def test_gzip_reproducibility_file_name(): + """ + Gzip should create reproducible archives with mtime. + + Note: Archives created with different filenames will still be different! + + GH 28103 + """ + df = tm.makeDataFrame() + compression_options = {"method": "gzip", "mtime": 1} + + # test for filename + with tm.ensure_clean() as path: + path = Path(path) + df.to_csv(path, compression=compression_options) + time.sleep(2) + output = path.read_bytes() + df.to_csv(path, compression=compression_options) + assert output == path.read_bytes() + + +def test_gzip_reproducibility_file_object(): + """ + Gzip should create reproducible archives with mtime. 
+ + GH 28103 + """ + df = tm.makeDataFrame() + compression_options = {"method": "gzip", "mtime": 1} + + # test for file object + buffer = io.BytesIO() + df.to_csv(buffer, compression=compression_options, mode="wb") + output = buffer.getvalue() + time.sleep(2) + buffer = io.BytesIO() + df.to_csv(buffer, compression=compression_options, mode="wb") + assert output == buffer.getvalue() + + def test_with_missing_lzma(): """Tests if import pandas works when lzma is not present.""" # https://github.com/pandas-dev/pandas/issues/27575 From 8204c88f74da437326fc23d27c75dcddcce7135e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Wed, 12 Aug 2020 21:09:23 -0400 Subject: [PATCH 2/2] typing for compression --- pandas/_typing.py | 5 +++++ pandas/core/generic.py | 7 ++++--- pandas/io/common.py | 24 +++++++++++++++--------- pandas/io/formats/csvs.py | 6 +++--- pandas/io/json/_json.py | 31 ++++++++++++++++++++++--------- pandas/io/pickle.py | 8 ++++---- pandas/io/stata.py | 19 ++++++------------- 7 files changed, 59 insertions(+), 41 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 47a102ddc70e0..1b972030ef5a5 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -109,3 +109,8 @@ # for arbitrary kwargs passed during reading/writing files StorageOptions = Optional[Dict[str, Any]] + + +# compression keywords and compression +CompressionDict = Mapping[str, Optional[Union[str, int, bool]]] +CompressionOptions = Optional[Union[str, CompressionDict]] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 34fce4e398864..2219d54477d9e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -35,6 +35,7 @@ from pandas._libs.tslibs import Tick, Timestamp, to_offset from pandas._typing import ( Axis, + CompressionOptions, FilePathOrBuffer, FrameOrSeries, JSONSerializable, @@ -2058,7 +2059,7 @@ def to_json( date_unit: str = "ms", default_handler: Optional[Callable[[Any], JSONSerializable]] = None, lines: bool_t = False, - 
compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", index: bool_t = True, indent: Optional[int] = None, storage_options: StorageOptions = None, @@ -2646,7 +2647,7 @@ def to_sql( def to_pickle( self, path, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ) -> None: @@ -3053,7 +3054,7 @@ def to_csv( index_label: Optional[Union[bool_t, str, Sequence[Label]]] = None, mode: str = "w", encoding: Optional[str] = None, - compression: Optional[Union[str, Mapping[str, str]]] = "infer", + compression: CompressionOptions = "infer", quoting: Optional[int] = None, quotechar: str = '"', line_terminator: Optional[str] = None, diff --git a/pandas/io/common.py b/pandas/io/common.py index 8b48cbbdb56a8..54f35e689aac8 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -18,7 +18,6 @@ Optional, Tuple, Type, - Union, ) from urllib.parse import ( urljoin, @@ -29,7 +28,12 @@ ) import zipfile -from pandas._typing import FilePathOrBuffer, StorageOptions +from pandas._typing import ( + CompressionDict, + CompressionOptions, + FilePathOrBuffer, + StorageOptions, +) from pandas.compat import _get_lzma_file, _import_lzma from pandas.compat._optional import import_optional_dependency @@ -160,7 +164,7 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool: def get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, encoding: Optional[str] = None, - compression: Optional[str] = None, + compression: CompressionOptions = None, mode: Optional[str] = None, storage_options: StorageOptions = None, ): @@ -188,7 +192,7 @@ def get_filepath_or_buffer( Returns ------- - Tuple[FilePathOrBuffer, str, str, bool] + Tuple[FilePathOrBuffer, str, CompressionOptions, bool] Tuple containing the filepath or buffer, the encoding, the compression and should_close. 
""" @@ -291,8 +295,8 @@ def file_path_to_url(path: str) -> str: def get_compression_method( - compression: Optional[Union[str, Mapping[str, Any]]] -) -> Tuple[Optional[str], Dict[str, Any]]: + compression: CompressionOptions, +) -> Tuple[Optional[str], CompressionDict]: """ Simplifies a compression argument to a compression method string and a mapping containing additional arguments. @@ -316,7 +320,7 @@ def get_compression_method( if isinstance(compression, Mapping): compression_args = dict(compression) try: - compression_method = compression_args.pop("method") + compression_method = compression_args.pop("method") # type: ignore except KeyError as err: raise ValueError("If mapping, compression must have key 'method'") from err else: @@ -383,7 +387,7 @@ def get_handle( path_or_buf, mode: str, encoding=None, - compression: Optional[Union[str, Mapping[str, Any]]] = None, + compression: CompressionOptions = None, memory_map: bool = False, is_text: bool = True, errors=None, @@ -574,7 +578,9 @@ def __init__( if mode in ["wb", "rb"]: mode = mode.replace("b", "") self.archive_name = archive_name - super().__init__(file, mode, zipfile.ZIP_DEFLATED, **kwargs) + kwargs_zip: Dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED} + kwargs_zip.update(kwargs) + super().__init__(file, mode, **kwargs_zip) def write(self, data): archive_name = self.filename diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 6eceb94387171..c462a96da7133 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,13 +5,13 @@ import csv as csvlib from io import StringIO, TextIOWrapper import os -from typing import Hashable, List, Mapping, Optional, Sequence, Union +from typing import Hashable, List, Optional, Sequence, Union import warnings import numpy as np from pandas._libs import writers as libwriters -from pandas._typing import FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions from 
pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -44,7 +44,7 @@ def __init__( mode: str = "w", encoding: Optional[str] = None, errors: str = "strict", - compression: Union[str, Mapping[str, str], None] = "infer", + compression: CompressionOptions = "infer", quoting: Optional[int] = None, line_terminator="\n", chunksize: Optional[int] = None, diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 0d2b351926343..c2bd6302940bb 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -3,13 +3,13 @@ from io import BytesIO, StringIO from itertools import islice import os -from typing import Any, Callable, Optional, Type +from typing import IO, Any, Callable, List, Optional, Type import numpy as np import pandas._libs.json as json from pandas._libs.tslibs import iNaT -from pandas._typing import JSONSerializable, StorageOptions +from pandas._typing import CompressionOptions, JSONSerializable, StorageOptions from pandas.errors import AbstractMethodError from pandas.util._decorators import deprecate_kwarg, deprecate_nonkeyword_arguments @@ -19,7 +19,12 @@ from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.reshape.concat import concat -from pandas.io.common import get_filepath_or_buffer, get_handle, infer_compression +from pandas.io.common import ( + get_compression_method, + get_filepath_or_buffer, + get_handle, + infer_compression, +) from pandas.io.json._normalize import convert_to_line_delimits from pandas.io.json._table_schema import build_table_schema, parse_table_schema from pandas.io.parsers import _validate_integer @@ -41,7 +46,7 @@ def to_json( date_unit: str = "ms", default_handler: Optional[Callable[[Any], JSONSerializable]] = None, lines: bool = False, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", index: bool = True, indent: int = 0, storage_options: StorageOptions = None, @@ -369,7 +374,7 @@ def read_json( encoding=None, lines: bool = False, chunksize: 
Optional[int] = None, - compression="infer", + compression: CompressionOptions = "infer", nrows: Optional[int] = None, storage_options: StorageOptions = None, ): @@ -607,7 +612,9 @@ def read_json( if encoding is None: encoding = "utf-8" - compression = infer_compression(path_or_buf, compression) + compression_method, compression = get_compression_method(compression) + compression_method = infer_compression(path_or_buf, compression_method) + compression = dict(compression, method=compression_method) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( path_or_buf, encoding=encoding, @@ -667,10 +674,13 @@ def __init__( encoding, lines: bool, chunksize: Optional[int], - compression, + compression: CompressionOptions, nrows: Optional[int], ): + compression_method, compression = get_compression_method(compression) + compression = dict(compression, method=compression_method) + self.orient = orient self.typ = typ self.dtype = dtype @@ -687,6 +697,7 @@ def __init__( self.nrows_seen = 0 self.should_close = False self.nrows = nrows + self.file_handles: List[IO] = [] if self.chunksize is not None: self.chunksize = _validate_integer("chunksize", self.chunksize, 1) @@ -735,8 +746,8 @@ def _get_data_from_filepath(self, filepath_or_buffer): except (TypeError, ValueError): pass - if exists or self.compression is not None: - data, _ = get_handle( + if exists or self.compression["method"] is not None: + data, self.file_handles = get_handle( filepath_or_buffer, "r", encoding=self.encoding, @@ -816,6 +827,8 @@ def close(self): self.open_stream.close() except (IOError, AttributeError): pass + for file_handle in self.file_handles: + file_handle.close() def __next__(self): if self.nrows: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index eee6ec7c9feca..fc1d2e385cf72 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,9 +1,9 @@ """ pickle compat """ import pickle -from typing import Any, Optional +from typing import Any import warnings -from 
pandas._typing import FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions from pandas.compat import pickle_compat as pc from pandas.io.common import get_filepath_or_buffer, get_handle @@ -12,7 +12,7 @@ def to_pickle( obj: Any, filepath_or_buffer: FilePathOrBuffer, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ): @@ -114,7 +114,7 @@ def to_pickle( def read_pickle( filepath_or_buffer: FilePathOrBuffer, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ): """ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7a25617885839..ec3819f1673a8 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -35,7 +35,7 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array -from pandas._typing import FilePathOrBuffer, Label, StorageOptions +from pandas._typing import CompressionOptions, FilePathOrBuffer, Label, StorageOptions from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -1938,9 +1938,9 @@ def read_stata( def _open_file_binary_write( fname: FilePathOrBuffer, - compression: Union[str, Mapping[str, str], None], + compression: CompressionOptions, storage_options: StorageOptions = None, -) -> Tuple[BinaryIO, bool, Optional[Union[str, Mapping[str, str]]]]: +) -> Tuple[BinaryIO, bool, CompressionOptions]: """ Open a binary file or no-op if file-like. 
@@ -1978,17 +1978,10 @@ def _open_file_binary_write( # Extract compression mode as given, if dict compression_typ, compression_args = get_compression_method(compression) compression_typ = infer_compression(fname, compression_typ) - path_or_buf, _, compression_typ, _ = get_filepath_or_buffer( - fname, - mode="wb", - compression=compression_typ, - storage_options=storage_options, + compression = dict(compression_args, method=compression_typ) + path_or_buf, _, compression, _ = get_filepath_or_buffer( + fname, mode="wb", compression=compression, storage_options=storage_options, ) - if compression_typ is not None: - compression = compression_args - compression["method"] = compression_typ - else: - compression = None f, _ = get_handle(path_or_buf, "wb", compression=compression, is_text=False) return f, True, compression else: