Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 40 additions & 6 deletions lib/iris/fileformats/_ff.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ def __init__(self, filename, word_depth=DEFAULT_FF_WORD_DEPTH):
# Read the FF header data
with open(filename, "rb") as ff_file:
# typically 64-bit words (aka. int64 or ">i8")
header_data = np.fromfile(
header_data = _parse_binary_stream(
ff_file,
dtype=">i{0}".format(word_depth),
count=FF_HEADER_DEPTH,
Expand All @@ -351,19 +351,19 @@ def __init__(self, filename, word_depth=DEFAULT_FF_WORD_DEPTH):
ff_file.seek((addr[0] - 1) * word_depth, os.SEEK_SET)
if len(addr) == 2:
if elem == "integer_constants":
res = np.fromfile(
res = _parse_binary_stream(
ff_file,
dtype=">i{0}".format(word_depth),
count=addr[1],
)
else:
res = np.fromfile(
res = _parse_binary_stream(
ff_file,
dtype=">f{0}".format(word_depth),
count=addr[1],
)
elif len(addr) == 3:
res = np.fromfile(
res = _parse_binary_stream(
ff_file,
dtype=">f{0}".format(word_depth),
count=addr[1] * addr[2],
Expand Down Expand Up @@ -695,7 +695,7 @@ def _extract_field(self):
ff_file_seek(table_offset, os.SEEK_SET)

# Read the current PP header entry from the FF LOOKUP table.
header_longs = np.fromfile(
header_longs = _parse_binary_stream(
ff_file,
dtype=">i{0}".format(self._word_depth),
count=pp.NUM_LONG_HEADERS,
Expand All @@ -704,7 +704,7 @@ def _extract_field(self):
if header_longs[0] == _FF_LOOKUP_TABLE_TERMINATE:
# There are no more FF LOOKUP table entries to read.
break
header_floats = np.fromfile(
header_floats = _parse_binary_stream(
ff_file,
dtype=">f{0}".format(self._word_depth),
count=pp.NUM_FLOAT_HEADERS,
Expand Down Expand Up @@ -816,6 +816,40 @@ def __iter__(self):
return pp._interpret_fields(self._extract_field())


def _parse_binary_stream(file_like, dtype=np.float, count=-1):
"""
Replacement :func:`numpy.fromfile` due to python3 performance issues.

Args:

* file_like - Standard python file_like object.

Kwargs:

* dtype - Data type to be parsed out, used to work out bytes read in.

* count - The number of values required to be generated from the parsing.
The default is -1, which will read the entire contexts of the file_like
object and generate as many values as possible.

"""

# There are a wide range of types supported, we just need to know the byte
# size of the object, so we just make sure we've go an instance of a
# np.dtype
if not isinstance(dtype, np.dtype):
dtype = np.dtype(dtype)

# Allocate bytearray for the file to be read into, allowing the numpy array
# to be writable.
_buffer = bytearray(count * dtype.itemsize)
file_like.readinto(_buffer)

# Let numpy do the heavy lifting once we've sorted the file reading.
array = np.frombuffer(_buffer, dtype=dtype, count=-1)
Copy link
Member

@bjlittle bjlittle Aug 19, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@owena11 You're effectively reading the data twice here, right? Once to read the bytes from file, and once to parse the bytes into a numpy array using frombuffer.

I don't have access to a laptop at the moment, but I'm assuming that a file pointer to a stream doesn't support the buffer protocol? I guess I'm just wondering whether there is a way to do this by only streaming through the data once?

Perhaps that's not possible...I don't immediately know 🤔

Also, don't we need to care about byte order here? i.e., endianess

Copy link
Contributor Author

@owena11 owena11 Aug 20, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if I fully understand the first question here, but as an attempt at an answer anyway.....

We probably don't do two passes over the data with this implementation, It's a bit of an guess that I can go and look up the implementation details of numpy if needed, but the np.frombuffer has all of the information to create the header/object section of a np.array, so the implementation of frombuffer probably can be as simple as creating the array object and pointing the internal reference to the data to the _buffer object we supply as it supports the buffer protocol, so can share the data.

This is guess is supported by the associated flags on the array created:

  C_CONTIGUOUS : True
  F_CONTIGUOUS : True
  OWNDATA : False
  WRITEABLE : False
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

The array we create isn't writable because the bytes we passed are immutable. Numpy can then handle the copying if we ever need to modify the data (I don't think we ever do modify the header data in this module).

Happily I can answer more confidently for byte order! Byte order will be encoded within the np.dtype object, defaulting to the machine default. Throughout the calls in this module the dtype description passed specifies endianness which will be preserved when we convert to the np.dtype object.

Copy link
Contributor Author

@owena11 owena11 Aug 20, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah after thinking about this more the arrays generated definitely aren't being modified prior to a copy, otherwise numpy would throw a ValueError. However there is no performance hit in my timings for using the mutable bytearray rather than simple read so I've push up that as a change.

Would probably prevent annoying gremlins for anyone touching this code in the future, with more expected behavior.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@owena11 Awesome. Okay, let's roll with this... Looks good to me 👍

return array


def load_cubes(filenames, callback, constraints=None):
"""
Loads cubes from a list of fields files filenames.
Expand Down
8 changes: 5 additions & 3 deletions lib/iris/tests/unit/fileformats/ff/test_FF2PP.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,11 @@ def mock_for_extract_field(self, fields, x=None, y=None):
ff2pp._ff_header.grid = mock.Mock(return_value=grid)

open_func = "builtins.open"
with mock.patch("numpy.fromfile", return_value=[0]), mock.patch(
open_func
), mock.patch("struct.unpack_from", return_value=[4]), mock.patch(
with mock.patch(
"iris.fileformats._ff._parse_binary_stream", return_value=[0]
), mock.patch(open_func), mock.patch(
"struct.unpack_from", return_value=[4]
), mock.patch(
"iris.fileformats.pp.make_pp_field", side_effect=fields
), mock.patch(
"iris.fileformats._ff.FF2PP._payload", return_value=(0, 0)
Expand Down