Skip to content

Commit 4b108d3

Browse files
Refactor ImageSource <-> RelionSource, and change Simulation accordingly to populate its metadata
1 parent 27ea010 commit 4b108d3

File tree

3 files changed

+174
-135
lines changed

3 files changed

+174
-135
lines changed

src/aspire/source/image.py

Lines changed: 2 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -45,48 +45,10 @@ class ImageSource:
4545
`Filter` objects, for example, are stored in this metadata table as references to unique `Filter` objects that
4646
correspond to images in this `ImageSource`. Several rows of metadata may end up containing a reference to a small
4747
handful of unique `Filter` objects, depending on the values found in other columns (identical `Filter`
48-
objects, depending on unique CTF values found for _rlnDefocusU/_rlnDefocusV etc.
48+
objects). For example, a smaller number of CTFFilter objects may apply to subsets of particles depending on
49+
the unique "_rlnDefocusU"/"_rlnDefocusV" Relion parameters.
4950
"""
5051

51-
# The metadata_fields dictionary below specifies default data types of certain key fields used in the codebase.
52-
# The STAR file used to initialize subclasses of ImageSource may well contain other columns not found below; these
53-
# additional columns are available when read, and they default to the pandas data type 'object'.
54-
55-
metadata_fields = {
56-
"_rlnVoltage": float,
57-
"_rlnDefocusU": float,
58-
"_rlnDefocusV": float,
59-
"_rlnDefocusAngle": float,
60-
"_rlnSphericalAberration": float,
61-
"_rlnDetectorPixelSize": float,
62-
"_rlnCtfFigureOfMerit": float,
63-
"_rlnMagnification": float,
64-
"_rlnAmplitudeContrast": float,
65-
"_rlnImageName": str,
66-
"_rlnOriginalName": str,
67-
"_rlnCtfImage": str,
68-
"_rlnCoordinateX": float,
69-
"_rlnCoordinateY": float,
70-
"_rlnCoordinateZ": float,
71-
"_rlnNormCorrection": float,
72-
"_rlnMicrographName": str,
73-
"_rlnGroupName": str,
74-
"_rlnGroupNumber": str,
75-
"_rlnOriginX": float,
76-
"_rlnOriginY": float,
77-
"_rlnAngleRot": float,
78-
"_rlnAngleTilt": float,
79-
"_rlnAnglePsi": float,
80-
"_rlnClassNumber": int,
81-
"_rlnLogLikeliContribution": float,
82-
"_rlnRandomSubset": int,
83-
"_rlnParticleName": str,
84-
"_rlnOriginalParticleName": str,
85-
"_rlnNrOfSignificantSamples": float,
86-
"_rlnNrOfFrames": int,
87-
"_rlnMaxValueProbDistribution": float,
88-
}
89-
9052
def __init__(self, L, n, dtype="double", metadata=None, memory=None):
9153
"""
9254
A Cryo-EM ImageSource object that supplies images along with other parameters for image manipulation.
@@ -143,34 +105,6 @@ def filter_indices(self):
143105
@filter_indices.setter
144106
def filter_indices(self, indices):
145107
# create metadata of filters for all images
146-
if indices is None:
147-
filter_values = np.nan
148-
else:
149-
attribute_list = (
150-
"voltage",
151-
"defocus_u",
152-
"defocus_v",
153-
"defocus_ang",
154-
"Cs",
155-
"alpha",
156-
)
157-
filter_values = np.zeros((len(indices), len(attribute_list)))
158-
for i, filt in enumerate(self.unique_filters):
159-
filter_values[indices == i] = [
160-
getattr(filt, attribute, np.nan) for attribute in attribute_list
161-
]
162-
163-
self.set_metadata(
164-
[
165-
"_rlnVoltage",
166-
"_rlnDefocusU",
167-
"_rlnDefocusV",
168-
"_rlnDefocusAngle",
169-
"_rlnSphericalAberration",
170-
"_rlnAmplitudeContrast",
171-
],
172-
filter_values,
173-
)
174108
return self.set_metadata(["__filter_indices"], indices)
175109

176110
@property

src/aspire/source/relion.py

Lines changed: 136 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -8,44 +8,60 @@
88
import pandas as pd
99

1010
from aspire.image import Image
11-
from aspire.operators import CTFFilter
11+
from aspire.operators import CTFFilter, IdentityFilter
1212
from aspire.source import ImageSource
1313
from aspire.storage import StarFile
14-
from aspire.utils import ensure
1514

1615
logger = logging.getLogger(__name__)
1716

1817

1918
class RelionSource(ImageSource):
20-
@classmethod
21-
def starfile2df(cls, filepath, data_folder=None, max_rows=None):
22-
if data_folder is not None:
23-
if not os.path.isabs(data_folder):
24-
data_folder = os.path.join(os.path.dirname(filepath), data_folder)
25-
else:
26-
data_folder = os.path.dirname(filepath)
27-
28-
# Note: Valid Relion image "_data.star" files have to have their data in the first loop of the first block.
29-
# We are getting the first (and only) block in this StarFile object
30-
df = StarFile(filepath).get_block_by_index(0)
31-
column_types = {name: cls.metadata_fields.get(name, str) for name in df.columns}
32-
df = df.astype(column_types)
33-
34-
df[["__mrc_index", "__mrc_filename"]] = df["_rlnImageName"].str.split(
35-
"@", 1, expand=True
36-
)
37-
df["__mrc_index"] = pd.to_numeric(df["__mrc_index"])
38-
39-
# Adding a full-filepath field to the Dataframe helps us save time later
40-
# Note that os.path.join works as expected when the second argument is an absolute path itself
41-
df["__mrc_filepath"] = df["__mrc_filename"].apply(
42-
lambda filename: os.path.join(data_folder, filename)
43-
)
44-
45-
if max_rows is None:
46-
return df
47-
else:
48-
return df.iloc[:max_rows]
19+
"""
20+
A RelionSource represents a source of picked and cropped particles stored as slices in a `.mrcs` stack.
21+
It must be instantiated via a STAR file, which--at a minumum--lists the particles in each `.mrcs` stack in the
22+
`_rlnImageName` column. The STAR file may also contain Relion-specific metadata columns. This information
23+
is read into a Pandas DataFrame table containing a row for each particle specifying its location and
24+
its metadata. The metadata table may be augmented or modified via helper methods found in ImageSource. It may
25+
store, for example, Filter objects added during preprocessing.
26+
"""
27+
28+
# The metadata_fields dictionary below specifies default data types
29+
# of certain key fields used in the codebase,
30+
# which are originally read from Relion STAR files.
31+
relion_metadata_fields = {
32+
"_rlnVoltage": float,
33+
"_rlnDefocusU": float,
34+
"_rlnDefocusV": float,
35+
"_rlnDefocusAngle": float,
36+
"_rlnSphericalAberration": float,
37+
"_rlnDetectorPixelSize": float,
38+
"_rlnCtfFigureOfMerit": float,
39+
"_rlnMagnification": float,
40+
"_rlnAmplitudeContrast": float,
41+
"_rlnImageName": str,
42+
"_rlnOriginalName": str,
43+
"_rlnCtfImage": str,
44+
"_rlnCoordinateX": float,
45+
"_rlnCoordinateY": float,
46+
"_rlnCoordinateZ": float,
47+
"_rlnNormCorrection": float,
48+
"_rlnMicrographName": str,
49+
"_rlnGroupName": str,
50+
"_rlnGroupNumber": str,
51+
"_rlnOriginX": float,
52+
"_rlnOriginY": float,
53+
"_rlnAngleRot": float,
54+
"_rlnAngleTilt": float,
55+
"_rlnAnglePsi": float,
56+
"_rlnClassNumber": int,
57+
"_rlnLogLikeliContribution": float,
58+
"_rlnRandomSubset": int,
59+
"_rlnParticleName": str,
60+
"_rlnOriginalParticleName": str,
61+
"_rlnNrOfSignificantSamples": float,
62+
"_rlnNrOfFrames": int,
63+
"_rlnMaxValueProbDistribution": float,
64+
}
4965

5066
def __init__(
5167
self,
@@ -77,7 +93,7 @@ def __init__(
7793
self.B = B
7894
self.n_workers = n_workers
7995

80-
metadata = self.__class__.starfile2df(filepath, data_folder, max_rows)
96+
metadata = self.populate_metadata(filepath, data_folder, max_rows)
8197

8298
n = len(metadata)
8399
if n == 0:
@@ -90,10 +106,10 @@ def __init__(
90106
# Get the 'mode' (data type) - TODO: There's probably a more direct way to do this.
91107
mode = int(mrc.header.mode)
92108
dtypes = {0: "int8", 1: "int16", 2: "float32", 6: "uint16"}
93-
ensure(
94-
mode in dtypes,
95-
f"Only modes={list(dtypes.keys())} in MRC files are supported for now.",
96-
)
109+
assert (
110+
mode in dtypes
111+
), f"Only modes={list(dtypes.keys())} in MRC files are supported for now."
112+
97113
dtype = dtypes[mode]
98114

99115
shape = mrc.data.shape
@@ -103,48 +119,100 @@ def __init__(
103119
if len(shape) == 2:
104120
shape = (1,) + shape
105121

106-
ensure(shape[1] == shape[2], "Only square images are supported")
122+
assert shape[1] == shape[2], "Only square images are supported"
107123
L = shape[1]
108124
logger.debug(f"Image size = {L}x{L}")
109125

110126
# Save original image resolution that we expect to use when we start reading actual data
111127
self._original_resolution = L
112128

113-
filter_params, filter_indices = np.unique(
114-
metadata[
115-
[
116-
"_rlnVoltage",
117-
"_rlnDefocusU",
118-
"_rlnDefocusV",
119-
"_rlnDefocusAngle",
120-
"_rlnSphericalAberration",
121-
"_rlnAmplitudeContrast",
122-
]
123-
].values,
124-
return_inverse=True,
125-
axis=0,
129+
ImageSource.__init__(
130+
self, L=L, n=n, dtype=dtype, metadata=metadata, memory=memory
126131
)
127132

128-
filters = []
129-
for row in filter_params:
130-
filters.append(
131-
CTFFilter(
132-
pixel_size=self.pixel_size,
133-
voltage=row[0],
134-
defocus_u=row[1],
135-
defocus_v=row[2],
136-
defocus_ang=row[3] * np.pi / 180, # degrees to radians
137-
Cs=row[4],
138-
alpha=row[5],
139-
B=B,
140-
)
133+
# CTF estimation parameters coming from Relion
134+
CTF_params = [
135+
"_rlnVoltage",
136+
"_rlnDefocusU",
137+
"_rlnDefocusV",
138+
"_rlnDefocusAngle",
139+
"_rlnSphericalAberration",
140+
"_rlnAmplitudeContrast",
141+
]
142+
# If these exist in the STAR file, we may create CTF filters for the source
143+
if set(CTF_params).issubset(metadata.columns):
144+
# partition particles according to unique CTF parameters
145+
filter_params, filter_indices = np.unique(
146+
metadata[CTF_params].values, return_inverse=True, axis=0
141147
)
148+
filters = []
149+
# for each unique CTF configuration, create a CTFFilter object
150+
for row in filter_params:
151+
filters.append(
152+
CTFFilter(
153+
pixel_size=self.pixel_size,
154+
voltage=row[0],
155+
defocus_u=row[1],
156+
defocus_v=row[2],
157+
defocus_ang=row[3] * np.pi / 180, # degrees to radians
158+
Cs=row[4],
159+
alpha=row[5],
160+
B=B,
161+
)
162+
)
163+
self.unique_filters = filters
164+
# filter_indices stores, for each particle index, the index in
165+
# self.unique_filters of the filter that should be applied
166+
self.filter_indices = filter_indices
167+
# If no CTF info in STAR, we initialize the filter values of metadata with default values
168+
else:
169+
self.unique_filters = [IdentityFilter()]
170+
self.filter_indices = np.zeros(self.n, dtype=int)
142171

143-
ImageSource.__init__(
144-
self, L=L, n=n, dtype=dtype, metadata=metadata, memory=memory
172+
def populate_metadata(self, filepath, data_folder=None, max_rows=None):
173+
"""
174+
Relion STAR files may contain a large number of metadata columns in addition
175+
to the locations of particles. We read this into a Pandas DataFrame and add some of
176+
our own columns for convenience.
177+
"""
178+
if data_folder is not None:
179+
if not os.path.isabs(data_folder):
180+
data_folder = os.path.join(os.path.dirname(filepath), data_folder)
181+
else:
182+
data_folder = os.path.dirname(filepath)
183+
184+
# Valid Relion STAR files always have their data in the first loop of the first block.
185+
# We are getting the first (and only) block in this StarFile object
186+
df = StarFile(filepath).get_block_by_index(0)
187+
# convert STAR file strings to data type for each field
188+
# columns without a specified data type are read as dtype=object
189+
column_types = {
190+
name: RelionSource.relion_metadata_fields.get(name, str)
191+
for name in df.columns
192+
}
193+
df = df.astype(column_types)
194+
195+
# particle locations are stored as e.g. '000001@first_micrograph.mrcs'
196+
# in the _rlnImageName column. here, we're splitting this information
197+
# so we can get the particle's index in the .mrcs stack as an int
198+
df[["__mrc_index", "__mrc_filename"]] = df["_rlnImageName"].str.split(
199+
"@", 1, expand=True
145200
)
146-
self.unique_filters = filters
147-
self.filter_indices = filter_indices
201+
# __mrc_index corresponds to the integer index of the particle in the __mrc_filename stack
202+
# Note that this is 1-based indexing
203+
df["__mrc_index"] = pd.to_numeric(df["__mrc_index"])
204+
205+
# Adding a full-filepath field to the Dataframe helps us save time later
206+
# Note that os.path.join works as expected when the second argument is an absolute path itself
207+
df["__mrc_filepath"] = df["__mrc_filename"].apply(
208+
lambda filename: os.path.join(data_folder, filename)
209+
)
210+
211+
if max_rows is None:
212+
return df
213+
else:
214+
max_rows = min(max_rows, len(df))
215+
return df.iloc[:max_rows]
148216

149217
def __str__(self):
150218
return f"RelionSource ({self.n} images of size {self.L}x{self.L})"
@@ -162,6 +230,7 @@ def load_single_mrcs(filepath, df):
162230
# the code below reshapes it to (1, resolution, resolution)
163231
if len(arr.shape) == 2:
164232
arr = arr.reshape((1,) + arr.shape)
233+
# __mrc_index is the 1-based index of the particle in the stack
165234
data = arr[df["__mrc_index"] - 1, :, :]
166235

167236
return df.index, data

src/aspire/source/simulation.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ def __init__(
8686
if unique_filters:
8787
if filter_indices is None:
8888
filter_indices = randi(len(unique_filters), n, seed=seed) - 1
89+
self._populate_ctf_metadata(filter_indices)
8990
self.filter_indices = filter_indices
9091

9192
self.offsets = offsets
@@ -96,6 +97,41 @@ def __init__(
9697
logger.info("Appending a NoiseAdder to generation pipeline")
9798
self.noise_adder = NoiseAdder(seed=self.seed, noise_filter=noise_filter)
9899

100+
def _populate_ctf_metadata(self, filter_indices):
101+
# Since we are not reading from a starfile, we must construct
102+
# metadata based on the CTF filters by hand and set the values
103+
# for these columns
104+
#
105+
# class attributes of CTFFilter:
106+
CTFFilter_attributes = (
107+
"voltage",
108+
"defocus_u",
109+
"defocus_v",
110+
"defocus_ang",
111+
"Cs",
112+
"alpha",
113+
)
114+
# get the CTF parameters, if they exist, for each filter
115+
# and for each image (indexed by filter_indices)
116+
filter_values = np.zeros((len(filter_indices), len(CTFFilter_attributes)))
117+
for i, filt in enumerate(self.unique_filters):
118+
filter_values[filter_indices == i] = [
119+
getattr(filt, att, np.nan) for att in CTFFilter_attributes
120+
]
121+
# set the corresponding Relion metadata values that we would expect
122+
# from a STAR file
123+
self.set_metadata(
124+
[
125+
"_rlnVoltage",
126+
"_rlnDefocusU",
127+
"_rlnDefocusV",
128+
"_rlnDefocusAngle",
129+
"_rlnSphericalAberration",
130+
"_rlnAmplitudeContrast",
131+
],
132+
filter_values,
133+
)
134+
99135
def _gaussian_blob_vols(self, L=8, C=2, K=16, alpha=1):
100136
"""
101137
Generate Gaussian blob volumes

0 commit comments

Comments
 (0)