From 6e2ff4e81ccadb02e351480fd9fa837cff8e98b1 Mon Sep 17 00:00:00 2001 From: ARF Date: Thu, 7 Apr 2016 15:03:18 +0200 Subject: [PATCH 1/7] __shapeIndex optimization: removed superfluous tell() x2.0 speedup over master --- shapefile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shapefile.py b/shapefile.py index d23f393f..e950dc73 100644 --- a/shapefile.py +++ b/shapefile.py @@ -392,7 +392,7 @@ def __shapeIndex(self, i=None): for r in range(numRecords): # Offsets are 16-bit words just like the file length self._offsets.append(unpack(">i", shx.read(4))[0] * 2) - shx.seek(shx.tell() + 4) + shx.seek(4, 1) if not i == None: return self._offsets[i] From 6e26248003b365847acf653f0709af6a9c79bbfe Mon Sep 17 00:00:00 2001 From: ARF Date: Thu, 7 Apr 2016 16:13:50 +0200 Subject: [PATCH 2/7] __shapeIndex optimization: remove seek() call x1.7 speedup over previous commit --- shapefile.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/shapefile.py b/shapefile.py index e950dc73..cf66c8fb 100644 --- a/shapefile.py +++ b/shapefile.py @@ -391,8 +391,7 @@ def __shapeIndex(self, i=None): shx.seek(100) for r in range(numRecords): # Offsets are 16-bit words just like the file length - self._offsets.append(unpack(">i", shx.read(4))[0] * 2) - shx.seek(4, 1) + self._offsets.append(unpack('>i4x', shx.read(8))[0] * 2) if not i == None: return self._offsets[i] From 8f671db0cded123c223c85ad2baf6d998f4e25a1 Mon Sep 17 00:00:00 2001 From: ARF Date: Thu, 7 Apr 2016 16:21:48 +0200 Subject: [PATCH 3/7] __shapeIndex optimization: use list comprehension x1.3 speedup over previous commit --- shapefile.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/shapefile.py b/shapefile.py index cf66c8fb..721c9126 100644 --- a/shapefile.py +++ b/shapefile.py @@ -389,9 +389,8 @@ def __shapeIndex(self, i=None): numRecords = shxRecordLength // 8 # Jump to the first record. shx.seek(100) - for r in range(numRecords): - # Offsets are 16-bit words just like the file length - self._offsets.append(unpack('>i4x', shx.read(8))[0] * 2) + # Offsets are 16-bit words just like the file length + self._offsets = [unpack('>i4x', shx.read(8))[0] * 2 for r in range(numRecords)] if not i == None: return self._offsets[i] From efafff8250d944e4e7aced120f5e511e1925064c Mon Sep 17 00:00:00 2001 From: ARF Date: Thu, 7 Apr 2016 16:54:50 +0200 Subject: [PATCH 4/7] __shapeIndex optimization: read into array and discard unneeded elements x1.7 speedup over previous commit --- shapefile.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/shapefile.py b/shapefile.py index 721c9126..010b54a3 100644 --- a/shapefile.py +++ b/shapefile.py @@ -390,7 +390,11 @@ def __shapeIndex(self, i=None): # Jump to the first record. shx.seek(100) # Offsets are 16-bit words just like the file length - self._offsets = [unpack('>i4x', shx.read(8))[0] * 2 for r in range(numRecords)] + self._offsets = [2*el for el in + array.array('i', unpack(">%si" % (numRecords*2), + shx.read(4 * numRecords*2)) + )[::2] + ] if not i == None: return self._offsets[i] From 4c0b9b4603851e689287f7708b151436ba89a972 Mon Sep 17 00:00:00 2001 From: ARF Date: Thu, 7 Apr 2016 17:41:10 +0200 Subject: [PATCH 5/7] __shapeIndex optimization: explicit format string excluding unneeded elements x1.5 speedup over previous commit Due to unexplained reasons, this seems to significantly speed up the apparently untouched read(). --- shapefile.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/shapefile.py b/shapefile.py index 010b54a3..8f058fb9 100644 --- a/shapefile.py +++ b/shapefile.py @@ -389,12 +389,10 @@ def __shapeIndex(self, i=None): numRecords = shxRecordLength // 8 # Jump to the first record. shx.seek(100) + shxRecords = array.array('i', unpack(">" + "i4x" * numRecords, + shx.read((4+4) * numRecords))) # Offsets are 16-bit words just like the file length - self._offsets = [2*el for el in - array.array('i', unpack(">%si" % (numRecords*2), - shx.read(4 * numRecords*2)) - )[::2] - ] + self._offsets = [2*el for el in shxRecords] if not i == None: return self._offsets[i] From eec5efe061166da688262c4742cda3d43dbc5c51 Mon Sep 17 00:00:00 2001 From: ARF Date: Fri, 8 Apr 2016 07:26:31 +0200 Subject: [PATCH 6/7] __shapeIndex optimization: use memoryview slicing to skip unused field x2.0 speedup over previous commit --- shapefile.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/shapefile.py b/shapefile.py index 8f058fb9..cfba5223 100644 --- a/shapefile.py +++ b/shapefile.py @@ -18,6 +18,11 @@ import tempfile import itertools +try: + memoryview(b'') +except NameError: + memoryview = lambda x: x + # # Constants for shape types NULL = 0 @@ -389,10 +394,12 @@ def __shapeIndex(self, i=None): numRecords = shxRecordLength // 8 # Jump to the first record. shx.seek(100) - shxRecords = array.array('i', unpack(">" + "i4x" * numRecords, - shx.read((4+4) * numRecords))) + shxRecords = array.array('i') + shxRecords.fromfile(shx, 2 * numRecords) + if sys.byteorder != 'big': + shxRecords.byteswap() # Offsets are 16-bit words just like the file length - self._offsets = [2*el for el in shxRecords] + self._offsets = [2 * el for el in memoryview(shxRecords)[::2]] if not i == None: return self._offsets[i] From fb25a045b44e70aea9d77572e543c497e990b396 Mon Sep 17 00:00:00 2001 From: ARF Date: Sat, 9 Apr 2016 14:32:31 +0200 Subject: [PATCH 7/7] __shapeIndex optimization: use numpy when available for array arithmetic x378 speedup over master with numpy available x22 speedup over master without numpy --- shapefile.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/shapefile.py b/shapefile.py index cfba5223..bd3be309 100644 --- a/shapefile.py +++ b/shapefile.py @@ -18,6 +18,12 @@ import tempfile import itertools +try: + import numpy + has_numpy = True +except ImportError: + has_numpy = False + try: memoryview(b'') except NameError: @@ -227,7 +233,7 @@ def __init__(self, *args, **kwargs): self.shx = None self.dbf = None self.shapeName = "Not specified" - self._offsets = [] + self._offsets = None self.shpLength = None self.numRecords = None self.fields = [] @@ -387,19 +393,22 @@ def __shapeIndex(self, i=None): shx = self.shx if not shx: return None - if not self._offsets: + if self._offsets is None: # File length (16-bit word * 2 = bytes) - header length shx.seek(24) shxRecordLength = (unpack(">i", shx.read(4))[0] * 2) - 100 numRecords = shxRecordLength // 8 # Jump to the first record. shx.seek(100) - shxRecords = array.array('i') - shxRecords.fromfile(shx, 2 * numRecords) - if sys.byteorder != 'big': - shxRecords.byteswap() # Offsets are 16-bit words just like the file length - self._offsets = [2 * el for el in memoryview(shxRecords)[::2]] + if has_numpy: + self._offsets = numpy.fromfile(shx, '>i4', 2 * numRecords)[::2] * 2 + else: + shxRecords = array.array('i') + shxRecords.fromfile(shx, 2 * numRecords) + if sys.byteorder != 'big': + shxRecords.byteswap() + self._offsets = [2 * el for el in memoryview(shxRecords)[::2]] if not i == None: return self._offsets[i]