Skip to content

Commit efe027f

Browse files
authored
Merge pull request #911 from ComputationalCryoEM/vb/pdgone
Vb/pdgone
2 parents 010ad33 + 5c089b7 commit efe027f

13 files changed

+165
-163
lines changed

environment-accelerate.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ dependencies:
1212
- pip
1313
- python=3.8
1414
- numpy=1.23.5
15-
- pandas=1.3.5
1615
- scipy=1.9.3
1716
- scikit-learn
1817
- scikit-image

environment-default.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ dependencies:
1212
- pip
1313
- python=3.8
1414
- numpy=1.23.5
15-
- pandas=1.3.5
1615
- scipy=1.9.3
1716
- scikit-learn
1817
- scikit-image

environment-intel.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ dependencies:
1212
- pip
1313
- python=3.8
1414
- numpy=1.23.5
15-
- pandas=1.3.5
1615
- scipy=1.9.3
1716
- scikit-learn
1817
- scikit-image

environment-openblas.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ dependencies:
1212
- pip
1313
- python=3.8
1414
- numpy=1.23.5
15-
- pandas=1.3.5
1615
- scipy=1.9.3
1716
- scikit-learn
1817
- scikit-image

setup.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ def read(fname):
3333
"mrcfile",
3434
"numpy>=1.21.5",
3535
"packaging",
36-
"pandas>=1.3.5",
3736
"psutil",
3837
"pyfftw",
3938
"PyWavelets",

src/aspire/ctf/ctf_estimator.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
import mrcfile
1313
import numpy as np
1414
from numpy import linalg as npla
15-
from pandas import DataFrame
1615
from scipy.optimize import linprog
1716
from scipy.signal.windows import dpss
1817

@@ -693,9 +692,8 @@ def write_star(self, name, params_dict, output_dir):
693692
data_block["_rlnAmplitudeContrast"] = params_dict["amplitude_contrast"]
694693
data_block["_rlnVoltage"] = params_dict["voltage"]
695694
data_block["_rlnMicrographPixelSize"] = params_dict["pixel_size"]
696-
df = DataFrame([data_block])
697695
blocks = OrderedDict()
698-
blocks["root"] = df
696+
blocks["root"] = data_block
699697
star = StarFile(blocks=blocks)
700698
star.write(os.path.join(output_dir, os.path.splitext(name)[0]) + ".star")
701699

src/aspire/source/coordinates.py

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77

88
import mrcfile
99
import numpy as np
10-
import pandas as pd
1110

1211
from aspire.image import Image
1312
from aspire.operators import CTFFilter, IdentityFilter
@@ -226,8 +225,9 @@ def _coords_list_from_star(self, star_file):
226225
return a list of coordinates in box format.
227226
:param star_file: A path to a STAR file containing particle centers
228227
"""
229-
df = StarFile(star_file).get_block_by_index(0).astype(float)
230-
coords = list(zip(df["_rlnCoordinateX"], df["_rlnCoordinateY"]))
228+
data_block = StarFile(star_file).get_block_by_index(0)
229+
coords = list(zip(data_block["_rlnCoordinateX"], data_block["_rlnCoordinateY"]))
230+
coords = [(float(x), float(y)) for x, y in coords]
231231
return [
232232
self._box_coord_from_center(coord, self.particle_size) for coord in coords
233233
]
@@ -318,16 +318,16 @@ def import_aspire_ctf(self, ctf):
318318
"Number of CTF STAR files must match number of micrographs."
319319
)
320320

321-
# merge DataFrames from CTF files
322-
dfs = []
321+
# merge dicts from CTF files
322+
data_blocks = defaultdict(list)
323323
for f in ctf:
324324
# ASPIRE's CTF Estimator produces legacy (<= 3.0) STAR files containing one row
325325
star = RelionStarFile(f)
326-
dfs.append(star.data_block)
326+
data_block = star.data_block
327+
for k, v in data_block.items():
328+
data_blocks[k].append(v)
327329

328-
df = pd.concat(dfs, ignore_index=True)
329-
330-
self._extract_ctf(df)
330+
self._extract_ctf(data_blocks)
331331

332332
def import_relion_ctf(self, ctf):
333333
"""
@@ -339,18 +339,18 @@ def import_relion_ctf(self, ctf):
339339
"""
340340
data_block = RelionStarFile(ctf).get_merged_data_block()
341341

342-
# data_block is a pandas Dataframe containing the micrographs
343-
if not len(data_block) == self.num_micrographs:
342+
# data_block is a dict containing the micrographs
343+
if not len(list(data_block.values())[0]) == self.num_micrographs:
344344
raise ValueError(
345345
f"{ctf} has CTF information for {len(data_block)}",
346346
f" micrographs but this source has {self.num_micrographs} micrographs.",
347347
)
348348

349349
self._extract_ctf(data_block)
350350

351-
def _extract_ctf(self, df):
351+
def _extract_ctf(self, data_block):
352352
"""
353-
Receives a flattened DataFrame containing micrograph CTF information, and populates
353+
Receives a dict containing micrograph CTF information, and populates
354354
the Source's CTF Filters, filter indices, and metadata.
355355
"""
356356
# required CTF params excluding pixel size
@@ -366,8 +366,11 @@ def _extract_ctf(self, df):
366366

367367
# get unique ctfs from the data block
368368
# i'th entry of `indices` contains the index of `filter_params` with corresponding CTF params
369+
ctf_data = np.stack(data_block[c] for c in CTF_params).astype(self.dtype).T
369370
filter_params, indices = np.unique(
370-
df[CTF_params].astype(self.dtype).values, return_inverse=True, axis=0
371+
ctf_data,
372+
return_inverse=True,
373+
axis=0,
371374
)
372375

373376
# convert defocus_ang from degrees to radians
@@ -643,16 +646,16 @@ def _validate_starfile(self, coord_file):
643646
"""
644647
Ensures that a STAR file contains numeric particle centers.
645648
"""
646-
df = StarFile(coord_file).get_block_by_index(0)
649+
data_block = StarFile(coord_file).get_block_by_index(0)
647650
# We're looking for specific columns for the X and Y coordinates
648-
if not all(col in df.columns for col in ["_rlnCoordinateX", "_rlnCoordinateY"]):
651+
if not all(col in data_block for col in ["_rlnCoordinateX", "_rlnCoordinateY"]):
649652
logger.error(f"Problem with coordinate file: {coord_file}")
650653
raise ValueError(
651654
"STAR file does not contain _rlnCoordinateX, _rlnCoordinateY columns."
652655
)
653656
# check that all values in each column are numeric
654657
if not all(
655-
all(df[col].apply(self._is_number))
658+
all(map(self._is_number, data_block[col]))
656659
for col in ["_rlnCoordinateX", "_rlnCoordinateY"]
657660
):
658661
logger.error(f"Problem with coordinate file: {coord_file}")

src/aspire/source/relion.py

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
import mrcfile
77
import numpy as np
8-
import pandas as pd
98

109
from aspire.image import Image
1110
from aspire.operators import CTFFilter, IdentityFilter
@@ -20,7 +19,7 @@ class RelionSource(ImageSource):
2019
A RelionSource represents a source of picked and cropped particles stored as slices in a `.mrcs` stack.
2120
It must be instantiated via a STAR file, which--at a minimum--lists the particles in each `.mrcs` stack in the
2221
`_rlnImageName` column. The STAR file may also contain Relion-specific metadata columns. This information
23-
is read into a Pandas DataFrame table containing a row for each particle specifying its location and
22+
is read into dictionaries containing rows for each particle specifying its location and
2423
its metadata. The metadata table may be augmented or modified via helper methods found in ImageSource. It may
2524
store, for example, Filter objects added during preprocessing.
2625
"""
@@ -61,12 +60,12 @@ def __init__(
6160

6261
metadata = self.populate_metadata()
6362

64-
n = len(metadata)
63+
n = len(metadata["__mrc_filepath"])
6564
if n == 0:
6665
raise RuntimeError("No mrcs files found for starfile!")
6766

6867
# Peek into the first image and populate some attributes
69-
first_mrc_filepath = metadata.loc[0]["__mrc_filepath"]
68+
first_mrc_filepath = metadata["__mrc_filepath"][0]
7069
mrc = mrcfile.open(first_mrc_filepath)
7170

7271
# Get the 'mode' (data type) - TODO: There's probably a more direct way to do this.
@@ -106,10 +105,13 @@ def __init__(
106105
"_rlnAmplitudeContrast",
107106
]
108107
# If these all exist in the STAR file, we may create CTF filters for the source
109-
if set(CTF_params).issubset(metadata.columns):
108+
if set(CTF_params).issubset(metadata.keys()):
110109
# partition particles according to unique CTF parameters
110+
ctf_data = np.stack(metadata[k] for k in CTF_params).T
111111
filter_params, filter_indices = np.unique(
112-
metadata[CTF_params].values, return_inverse=True, axis=0
112+
ctf_data,
113+
return_inverse=True,
114+
axis=0,
113115
)
114116
filters = []
115117
# for each unique CTF configuration, create a CTFFilter object
@@ -132,7 +134,7 @@ def __init__(
132134
self.filter_indices = filter_indices
133135

134136
# We have provided some, but not all the required params
135-
elif any(param in metadata.columns for param in CTF_params):
137+
elif any(param in metadata for param in CTF_params):
136138
logger.warning(
137139
f"Found partially populated CTF Params."
138140
f" To automatically populate CTFFilters provide {CTF_params}"
@@ -151,7 +153,7 @@ def __init__(
151153
def populate_metadata(self):
152154
"""
153155
Relion STAR files may contain a large number of metadata columns in addition
154-
to the locations of particles. We read this into a Pandas DataFrame and add some of
156+
to the locations of particles. We read this into a dict and add some of
155157
our own columns for convenience.
156158
"""
157159
if self.data_folder is not None:
@@ -167,25 +169,24 @@ def populate_metadata(self):
167169
# particle locations are stored as e.g. '000001@first_micrograph.mrcs'
168170
# in the _rlnImageName column. here, we're splitting this information
169171
# so we can get the particle's index in the .mrcs stack as an int
170-
metadata[["__mrc_index", "__mrc_filename"]] = metadata[
171-
"_rlnImageName"
172-
].str.split("@", n=1, expand=True)
172+
indices_filenames = [s.split("@") for s in metadata["_rlnImageName"]]
173173
# __mrc_index corresponds to the integer index of the particle in the __mrc_filename stack
174174
# Note that this is 1-based indexing
175-
metadata["__mrc_index"] = pd.to_numeric(metadata["__mrc_index"])
175+
metadata["__mrc_index"] = np.array([int(s[0]) for s in indices_filenames])
176+
metadata["__mrc_filename"] = np.array([s[1] for s in indices_filenames])
176177

177178
# Adding a full-filepath field to the metadata dict helps us save time later
178179
# Note that os.path.join works as expected when the second argument is an absolute path itself
179-
metadata["__mrc_filepath"] = metadata["__mrc_filename"].apply(
180-
lambda filename: os.path.join(self.data_folder, filename)
180+
metadata["__mrc_filepath"] = np.array(
181+
[os.path.join(self.data_folder, p) for p in metadata["__mrc_filename"]]
181182
)
182183

183184
# finally, truncate every metadata column at max_rows
184185
if self.max_rows is None:
185186
return metadata
186187
else:
187-
max_rows = min(self.max_rows, len(metadata))
188-
return metadata.iloc[:max_rows]
188+
max_rows = min(self.max_rows, len(metadata["__mrc_filepath"]))
189+
return {k: v[:max_rows] for k, v in metadata.items()}
189190

190191
def __str__(self):
191192
return f"RelionSource ({self.n} images of size {self.L}x{self.L})"
@@ -209,34 +210,38 @@ def _images(self, indices):
209210
# Log the indices in case needed to debug a crash
210211
logger.debug(f"Indices: {indices}")
211212

212-
def load_single_mrcs(filepath, df):
213+
def load_single_mrcs(filepath, indices):
213214
arr = mrcfile.open(filepath).data
214215
# if the stack only contains one image, arr will have shape (resolution, resolution)
215216
# the code below reshapes it to (1, resolution, resolution)
216217
if len(arr.shape) == 2:
217218
arr = arr.reshape((1,) + arr.shape)
218219
# __mrc_index is the 1-based index of the particle in the stack
219-
data = arr[df["__mrc_index"] - 1, :, :]
220+
data = arr[self._metadata["__mrc_index"][indices] - 1, :, :]
220221

221-
return df.index, data
222+
return indices, data
222223

223224
n_workers = self.n_workers
224225
if n_workers < 0:
225226
n_workers = cpu_count() - 1
226227

227-
df = self._metadata.loc[indices]
228228
im = np.empty(
229229
(len(indices), self._original_resolution, self._original_resolution),
230230
dtype=self.dtype,
231231
)
232232

233-
groups = df.groupby("__mrc_filepath")
234-
n_workers = min(n_workers, len(groups))
233+
filepaths, filepath_indices = np.unique(
234+
self._metadata["__mrc_filepath"], return_inverse=True
235+
)
236+
n_workers = min(n_workers, len(filepaths))
235237

236238
with futures.ThreadPoolExecutor(n_workers) as executor:
237239
to_do = []
238-
for filepath, _df in groups:
239-
future = executor.submit(load_single_mrcs, filepath, _df)
240+
for i, filepath in enumerate(filepaths):
241+
this_filepath_indices = np.where(filepath_indices == i)[0]
242+
future = executor.submit(
243+
load_single_mrcs, filepath, this_filepath_indices
244+
)
240245
to_do.append(future)
241246

242247
for future in futures.as_completed(to_do):

0 commit comments

Comments
 (0)