Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 2 additions & 68 deletions src/aspire/source/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,48 +44,10 @@ class ImageSource:
`Filter` objects, for example, are stored in this metadata table as references to unique `Filter` objects that
correspond to images in this `ImageSource`. Several rows of metadata may end up containing a reference to a small
handful of unique `Filter` objects, depending on the values found in other columns (identical `Filter`
objects, depending on unique CTF values found for _rlnDefocusU/_rlnDefocusV etc.
objects). For example, a smaller number of CTFFilter objects may apply to subsets of particles depending on
the unique "_rlnDefocusU"/"_rlnDefocusV" Relion parameters.
"""

# The metadata_fields dictionary below specifies default data types of certain key fields used in the codebase.
# The STAR file used to initialize subclasses of ImageSource may well contain other columns not found below; these
# additional columns are available when read, and they default to the pandas data type 'object'.

metadata_fields = {
"_rlnVoltage": float,
"_rlnDefocusU": float,
"_rlnDefocusV": float,
"_rlnDefocusAngle": float,
"_rlnSphericalAberration": float,
"_rlnDetectorPixelSize": float,
"_rlnCtfFigureOfMerit": float,
"_rlnMagnification": float,
"_rlnAmplitudeContrast": float,
"_rlnImageName": str,
"_rlnOriginalName": str,
"_rlnCtfImage": str,
"_rlnCoordinateX": float,
"_rlnCoordinateY": float,
"_rlnCoordinateZ": float,
"_rlnNormCorrection": float,
"_rlnMicrographName": str,
"_rlnGroupName": str,
"_rlnGroupNumber": str,
"_rlnOriginX": float,
"_rlnOriginY": float,
"_rlnAngleRot": float,
"_rlnAngleTilt": float,
"_rlnAnglePsi": float,
"_rlnClassNumber": int,
"_rlnLogLikeliContribution": float,
"_rlnRandomSubset": int,
"_rlnParticleName": str,
"_rlnOriginalParticleName": str,
"_rlnNrOfSignificantSamples": float,
"_rlnNrOfFrames": int,
"_rlnMaxValueProbDistribution": float,
}

def __init__(self, L, n, dtype="double", metadata=None, memory=None):
"""
A Cryo-EM ImageSource object that supplies images along with other parameters for image manipulation.
Expand Down Expand Up @@ -150,34 +112,6 @@ def filter_indices(self):
@filter_indices.setter
def filter_indices(self, indices):
# create metadata of filters for all images
if indices is None:
filter_values = np.nan
else:
attribute_list = (
"voltage",
"defocus_u",
"defocus_v",
"defocus_ang",
"Cs",
"alpha",
)
filter_values = np.zeros((len(indices), len(attribute_list)))
for i, filt in enumerate(self.unique_filters):
filter_values[indices == i] = [
getattr(filt, attribute, np.nan) for attribute in attribute_list
]

self.set_metadata(
[
"_rlnVoltage",
"_rlnDefocusU",
"_rlnDefocusV",
"_rlnDefocusAngle",
"_rlnSphericalAberration",
"_rlnAmplitudeContrast",
],
filter_values,
)
return self.set_metadata(["__filter_indices"], indices)

@property
Expand Down
203 changes: 136 additions & 67 deletions src/aspire/source/relion.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,44 +8,60 @@
import pandas as pd

from aspire.image import Image
from aspire.operators import CTFFilter
from aspire.operators import CTFFilter, IdentityFilter
from aspire.source import ImageSource
from aspire.storage import StarFile
from aspire.utils import ensure

logger = logging.getLogger(__name__)


class RelionSource(ImageSource):
@classmethod
def starfile2df(cls, filepath, data_folder=None, max_rows=None):
if data_folder is not None:
if not os.path.isabs(data_folder):
data_folder = os.path.join(os.path.dirname(filepath), data_folder)
else:
data_folder = os.path.dirname(filepath)

# Note: Valid Relion image "_data.star" files have to have their data in the first loop of the first block.
# We are getting the first (and only) block in this StarFile object
df = StarFile(filepath).get_block_by_index(0)
column_types = {name: cls.metadata_fields.get(name, str) for name in df.columns}
df = df.astype(column_types)

df[["__mrc_index", "__mrc_filename"]] = df["_rlnImageName"].str.split(
"@", 1, expand=True
)
df["__mrc_index"] = pd.to_numeric(df["__mrc_index"])

# Adding a full-filepath field to the Dataframe helps us save time later
# Note that os.path.join works as expected when the second argument is an absolute path itself
df["__mrc_filepath"] = df["__mrc_filename"].apply(
lambda filename: os.path.join(data_folder, filename)
)

if max_rows is None:
return df
else:
return df.iloc[:max_rows]
"""
A RelionSource represents a source of picked and cropped particles stored as slices in a `.mrcs` stack.
It must be instantiated via a STAR file, which--at a minumum--lists the particles in each `.mrcs` stack in the
`_rlnImageName` column. The STAR file may also contain Relion-specific metadata columns. This information
is read into a Pandas DataFrame table containing a row for each particle specifying its location and
its metadata. The metadata table may be augmented or modified via helper methods found in ImageSource. It may
store, for example, Filter objects added during preprocessing.
"""

# The metadata_fields dictionary below specifies default data types
# of certain key fields used in the codebase,
# which are originally read from Relion STAR files.
relion_metadata_fields = {
"_rlnVoltage": float,
"_rlnDefocusU": float,
"_rlnDefocusV": float,
"_rlnDefocusAngle": float,
"_rlnSphericalAberration": float,
"_rlnDetectorPixelSize": float,
"_rlnCtfFigureOfMerit": float,
"_rlnMagnification": float,
"_rlnAmplitudeContrast": float,
"_rlnImageName": str,
"_rlnOriginalName": str,
"_rlnCtfImage": str,
"_rlnCoordinateX": float,
"_rlnCoordinateY": float,
"_rlnCoordinateZ": float,
"_rlnNormCorrection": float,
"_rlnMicrographName": str,
"_rlnGroupName": str,
"_rlnGroupNumber": str,
"_rlnOriginX": float,
"_rlnOriginY": float,
"_rlnAngleRot": float,
"_rlnAngleTilt": float,
"_rlnAnglePsi": float,
"_rlnClassNumber": int,
"_rlnLogLikeliContribution": float,
"_rlnRandomSubset": int,
"_rlnParticleName": str,
"_rlnOriginalParticleName": str,
"_rlnNrOfSignificantSamples": float,
"_rlnNrOfFrames": int,
"_rlnMaxValueProbDistribution": float,
}

def __init__(
self,
Expand Down Expand Up @@ -77,7 +93,7 @@ def __init__(
self.B = B
self.n_workers = n_workers

metadata = self.__class__.starfile2df(filepath, data_folder, max_rows)
metadata = self.populate_metadata(filepath, data_folder, max_rows)

n = len(metadata)
if n == 0:
Expand All @@ -90,10 +106,10 @@ def __init__(
# Get the 'mode' (data type) - TODO: There's probably a more direct way to do this.
mode = int(mrc.header.mode)
dtypes = {0: "int8", 1: "int16", 2: "float32", 6: "uint16"}
ensure(
mode in dtypes,
f"Only modes={list(dtypes.keys())} in MRC files are supported for now.",
)
assert (
mode in dtypes
), f"Only modes={list(dtypes.keys())} in MRC files are supported for now."

dtype = dtypes[mode]

shape = mrc.data.shape
Expand All @@ -103,48 +119,100 @@ def __init__(
if len(shape) == 2:
shape = (1,) + shape

ensure(shape[1] == shape[2], "Only square images are supported")
assert shape[1] == shape[2], "Only square images are supported"
L = shape[1]
logger.debug(f"Image size = {L}x{L}")

# Save original image resolution that we expect to use when we start reading actual data
self._original_resolution = L

filter_params, filter_indices = np.unique(
metadata[
[
"_rlnVoltage",
"_rlnDefocusU",
"_rlnDefocusV",
"_rlnDefocusAngle",
"_rlnSphericalAberration",
"_rlnAmplitudeContrast",
]
].values,
return_inverse=True,
axis=0,
ImageSource.__init__(
self, L=L, n=n, dtype=dtype, metadata=metadata, memory=memory
)

filters = []
for row in filter_params:
filters.append(
CTFFilter(
pixel_size=self.pixel_size,
voltage=row[0],
defocus_u=row[1],
defocus_v=row[2],
defocus_ang=row[3] * np.pi / 180, # degrees to radians
Cs=row[4],
alpha=row[5],
B=B,
)
# CTF estimation parameters coming from Relion
CTF_params = [
"_rlnVoltage",
"_rlnDefocusU",
"_rlnDefocusV",
"_rlnDefocusAngle",
"_rlnSphericalAberration",
"_rlnAmplitudeContrast",
]
# If these exist in the STAR file, we may create CTF filters for the source
if set(CTF_params).issubset(metadata.columns):
# partition particles according to unique CTF parameters
filter_params, filter_indices = np.unique(
metadata[CTF_params].values, return_inverse=True, axis=0
)
filters = []
# for each unique CTF configuration, create a CTFFilter object
for row in filter_params:
filters.append(
CTFFilter(
pixel_size=self.pixel_size,
voltage=row[0],
defocus_u=row[1],
defocus_v=row[2],
defocus_ang=row[3] * np.pi / 180, # degrees to radians
Cs=row[4],
alpha=row[5],
B=B,
)
)
self.unique_filters = filters
# filter_indices stores, for each particle index, the index in
# self.unique_filters of the filter that should be applied
self.filter_indices = filter_indices
# If no CTF info in STAR, we initialize the filter values of metadata with default values
else:
self.unique_filters = [IdentityFilter()]
self.filter_indices = np.zeros(self.n, dtype=int)

ImageSource.__init__(
self, L=L, n=n, dtype=dtype, metadata=metadata, memory=memory
def populate_metadata(self, filepath, data_folder=None, max_rows=None):
"""
Relion STAR files may contain a large number of metadata columns in addition
to the locations of particles. We read this into a Pandas DataFrame and add some of
our own columns for convenience.
"""
if data_folder is not None:
if not os.path.isabs(data_folder):
data_folder = os.path.join(os.path.dirname(filepath), data_folder)
else:
data_folder = os.path.dirname(filepath)

# Valid Relion STAR files always have their data in the first loop of the first block.
# We are getting the first (and only) block in this StarFile object
df = StarFile(filepath).get_block_by_index(0)
# convert STAR file strings to data type for each field
# columns without a specified data type are read as dtype=object
column_types = {
name: RelionSource.relion_metadata_fields.get(name, str)
for name in df.columns
}
df = df.astype(column_types)

# particle locations are stored as e.g. '000001@first_micrograph.mrcs'
# in the _rlnImageName column. here, we're splitting this information
# so we can get the particle's index in the .mrcs stack as an int
df[["__mrc_index", "__mrc_filename"]] = df["_rlnImageName"].str.split(
"@", 1, expand=True
)
self.unique_filters = filters
self.filter_indices = filter_indices
# __mrc_index corresponds to the integer index of the particle in the __mrc_filename stack
# Note that this is 1-based indexing
df["__mrc_index"] = pd.to_numeric(df["__mrc_index"])

# Adding a full-filepath field to the Dataframe helps us save time later
# Note that os.path.join works as expected when the second argument is an absolute path itself
df["__mrc_filepath"] = df["__mrc_filename"].apply(
lambda filename: os.path.join(data_folder, filename)
)

if max_rows is None:
return df
else:
max_rows = min(max_rows, len(df))
return df.iloc[:max_rows]

def __str__(self):
return f"RelionSource ({self.n} images of size {self.L}x{self.L})"
Expand All @@ -162,6 +230,7 @@ def load_single_mrcs(filepath, df):
# the code below reshapes it to (1, resolution, resolution)
if len(arr.shape) == 2:
arr = arr.reshape((1,) + arr.shape)
# __mrc_index is the 1-based index of the particle in the stack
data = arr[df["__mrc_index"] - 1, :, :]

return df.index, data
Expand Down
36 changes: 36 additions & 0 deletions src/aspire/source/simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def __init__(
if unique_filters:
if filter_indices is None:
filter_indices = randi(len(unique_filters), n, seed=seed) - 1
self._populate_ctf_metadata(filter_indices)
self.filter_indices = filter_indices
else:
self.filter_indices = np.zeros(n)
Expand All @@ -105,6 +106,41 @@ def __init__(
logger.info("Appending a NoiseAdder to generation pipeline")
self.noise_adder = NoiseAdder(seed=self.seed, noise_filter=noise_filter)

def _populate_ctf_metadata(self, filter_indices):
# Since we are not reading from a starfile, we must construct
# metadata based on the CTF filters by hand and set the values
# for these columns
#
# class attributes of CTFFilter:
CTFFilter_attributes = (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if these would be better as a dictionary mapping between CTFFilter_attributes and CTF_params. They look similar. At least similar enough I had to look if exactly the same...

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah there is a one-to-one correspondence. But the two sets of strings are used separately, so I think to do the set_metadata line that follows this line, I'd have to do something like CTFDict.keys(), which I would probably find more confusing if I were a newcomer. Unless I'm missing something

"voltage",
"defocus_u",
"defocus_v",
"defocus_ang",
"Cs",
"alpha",
)
# get the CTF parameters, if they exist, for each filter
# and for each image (indexed by filter_indices)
filter_values = np.zeros((len(filter_indices), len(CTFFilter_attributes)))
for i, filt in enumerate(self.unique_filters):
filter_values[filter_indices == i] = [
getattr(filt, att, np.nan) for att in CTFFilter_attributes
]
# set the corresponding Relion metadata values that we would expect
# from a STAR file
self.set_metadata(
[
"_rlnVoltage",
"_rlnDefocusU",
"_rlnDefocusV",
"_rlnDefocusAngle",
"_rlnSphericalAberration",
"_rlnAmplitudeContrast",
],
filter_values,
)

def projections(self, start=0, num=np.inf, indices=None):
"""
Return projections of generated volumes, without applying filters/shifts/amplitudes/noise
Expand Down