88import pandas as pd
99
1010from aspire .image import Image
11- from aspire .operators import CTFFilter
11+ from aspire .operators import CTFFilter , IdentityFilter
1212from aspire .source import ImageSource
1313from aspire .storage import StarFile
14- from aspire .utils import ensure
1514
1615logger = logging .getLogger (__name__ )
1716
1817
1918class RelionSource (ImageSource ):
20- @classmethod
21- def starfile2df (cls , filepath , data_folder = None , max_rows = None ):
22- if data_folder is not None :
23- if not os .path .isabs (data_folder ):
24- data_folder = os .path .join (os .path .dirname (filepath ), data_folder )
25- else :
26- data_folder = os .path .dirname (filepath )
27-
28- # Note: Valid Relion image "_data.star" files have to have their data in the first loop of the first block.
29- # We are getting the first (and only) block in this StarFile object
30- df = StarFile (filepath ).get_block_by_index (0 )
31- column_types = {name : cls .metadata_fields .get (name , str ) for name in df .columns }
32- df = df .astype (column_types )
33-
34- df [["__mrc_index" , "__mrc_filename" ]] = df ["_rlnImageName" ].str .split (
35- "@" , 1 , expand = True
36- )
37- df ["__mrc_index" ] = pd .to_numeric (df ["__mrc_index" ])
38-
39- # Adding a full-filepath field to the Dataframe helps us save time later
40- # Note that os.path.join works as expected when the second argument is an absolute path itself
41- df ["__mrc_filepath" ] = df ["__mrc_filename" ].apply (
42- lambda filename : os .path .join (data_folder , filename )
43- )
44-
45- if max_rows is None :
46- return df
47- else :
48- return df .iloc [:max_rows ]
19+ """
20+ A RelionSource represents a source of picked and cropped particles stored as slices in a `.mrcs` stack.
21+ It must be instantiated via a STAR file, which--at a minimum--lists the particles in each `.mrcs` stack in the
22+ `_rlnImageName` column. The STAR file may also contain Relion-specific metadata columns. This information
23+ is read into a Pandas DataFrame table containing a row for each particle specifying its location and
24+ its metadata. The metadata table may be augmented or modified via helper methods found in ImageSource. It may
25+ store, for example, Filter objects added during preprocessing.
26+ """
27+
28+ # The relion_metadata_fields dictionary below specifies default data types
29+ # of certain key fields used in the codebase,
30+ # which are originally read from Relion STAR files.
31+ relion_metadata_fields = {
32+ "_rlnVoltage" : float ,
33+ "_rlnDefocusU" : float ,
34+ "_rlnDefocusV" : float ,
35+ "_rlnDefocusAngle" : float ,
36+ "_rlnSphericalAberration" : float ,
37+ "_rlnDetectorPixelSize" : float ,
38+ "_rlnCtfFigureOfMerit" : float ,
39+ "_rlnMagnification" : float ,
40+ "_rlnAmplitudeContrast" : float ,
41+ "_rlnImageName" : str ,
42+ "_rlnOriginalName" : str ,
43+ "_rlnCtfImage" : str ,
44+ "_rlnCoordinateX" : float ,
45+ "_rlnCoordinateY" : float ,
46+ "_rlnCoordinateZ" : float ,
47+ "_rlnNormCorrection" : float ,
48+ "_rlnMicrographName" : str ,
49+ "_rlnGroupName" : str ,
50+ "_rlnGroupNumber" : str ,
51+ "_rlnOriginX" : float ,
52+ "_rlnOriginY" : float ,
53+ "_rlnAngleRot" : float ,
54+ "_rlnAngleTilt" : float ,
55+ "_rlnAnglePsi" : float ,
56+ "_rlnClassNumber" : int ,
57+ "_rlnLogLikeliContribution" : float ,
58+ "_rlnRandomSubset" : int ,
59+ "_rlnParticleName" : str ,
60+ "_rlnOriginalParticleName" : str ,
61+ "_rlnNrOfSignificantSamples" : float ,
62+ "_rlnNrOfFrames" : int ,
63+ "_rlnMaxValueProbDistribution" : float ,
64+ }
4965
5066 def __init__ (
5167 self ,
@@ -77,7 +93,7 @@ def __init__(
7793 self .B = B
7894 self .n_workers = n_workers
7995
80- metadata = self .__class__ . starfile2df (filepath , data_folder , max_rows )
96+ metadata = self .populate_metadata (filepath , data_folder , max_rows )
8197
8298 n = len (metadata )
8399 if n == 0 :
@@ -90,10 +106,10 @@ def __init__(
90106 # Get the 'mode' (data type) - TODO: There's probably a more direct way to do this.
91107 mode = int (mrc .header .mode )
92108 dtypes = {0 : "int8" , 1 : "int16" , 2 : "float32" , 6 : "uint16" }
93- ensure (
94- mode in dtypes ,
95- f"Only modes={ list (dtypes .keys ())} in MRC files are supported for now." ,
96- )
109+ assert (
110+ mode in dtypes
111+ ), f"Only modes={ list (dtypes .keys ())} in MRC files are supported for now."
112+
97113 dtype = dtypes [mode ]
98114
99115 shape = mrc .data .shape
@@ -103,48 +119,100 @@ def __init__(
103119 if len (shape ) == 2 :
104120 shape = (1 ,) + shape
105121
106- ensure ( shape [1 ] == shape [2 ], "Only square images are supported" )
122+ assert shape [1 ] == shape [2 ], "Only square images are supported"
107123 L = shape [1 ]
108124 logger .debug (f"Image size = { L } x{ L } " )
109125
110126 # Save original image resolution that we expect to use when we start reading actual data
111127 self ._original_resolution = L
112128
113- filter_params , filter_indices = np .unique (
114- metadata [
115- [
116- "_rlnVoltage" ,
117- "_rlnDefocusU" ,
118- "_rlnDefocusV" ,
119- "_rlnDefocusAngle" ,
120- "_rlnSphericalAberration" ,
121- "_rlnAmplitudeContrast" ,
122- ]
123- ].values ,
124- return_inverse = True ,
125- axis = 0 ,
129+ ImageSource .__init__ (
130+ self , L = L , n = n , dtype = dtype , metadata = metadata , memory = memory
126131 )
127132
128- filters = []
129- for row in filter_params :
130- filters .append (
131- CTFFilter (
132- pixel_size = self .pixel_size ,
133- voltage = row [0 ],
134- defocus_u = row [1 ],
135- defocus_v = row [2 ],
136- defocus_ang = row [3 ] * np .pi / 180 , # degrees to radians
137- Cs = row [4 ],
138- alpha = row [5 ],
139- B = B ,
140- )
133+ # CTF estimation parameters coming from Relion
134+ CTF_params = [
135+ "_rlnVoltage" ,
136+ "_rlnDefocusU" ,
137+ "_rlnDefocusV" ,
138+ "_rlnDefocusAngle" ,
139+ "_rlnSphericalAberration" ,
140+ "_rlnAmplitudeContrast" ,
141+ ]
142+ # If these exist in the STAR file, we may create CTF filters for the source
143+ if set (CTF_params ).issubset (metadata .columns ):
144+ # partition particles according to unique CTF parameters
145+ filter_params , filter_indices = np .unique (
146+ metadata [CTF_params ].values , return_inverse = True , axis = 0
141147 )
148+ filters = []
149+ # for each unique CTF configuration, create a CTFFilter object
150+ for row in filter_params :
151+ filters .append (
152+ CTFFilter (
153+ pixel_size = self .pixel_size ,
154+ voltage = row [0 ],
155+ defocus_u = row [1 ],
156+ defocus_v = row [2 ],
157+ defocus_ang = row [3 ] * np .pi / 180 , # degrees to radians
158+ Cs = row [4 ],
159+ alpha = row [5 ],
160+ B = B ,
161+ )
162+ )
163+ self .unique_filters = filters
164+ # filter_indices stores, for each particle index, the index in
165+ # self.unique_filters of the filter that should be applied
166+ self .filter_indices = filter_indices
167+ # If no CTF info in STAR, we initialize the filter values of metadata with default values
168+ else :
169+ self .unique_filters = [IdentityFilter ()]
170+ self .filter_indices = np .zeros (self .n , dtype = int )
142171
143- ImageSource .__init__ (
144- self , L = L , n = n , dtype = dtype , metadata = metadata , memory = memory
172+ def populate_metadata (self , filepath , data_folder = None , max_rows = None ):
173+ """
174+ Relion STAR files may contain a large number of metadata columns in addition
175+ to the locations of particles. We read this into a Pandas DataFrame and add some of
176+ our own columns for convenience.
177+ """
178+ if data_folder is not None :
179+ if not os .path .isabs (data_folder ):
180+ data_folder = os .path .join (os .path .dirname (filepath ), data_folder )
181+ else :
182+ data_folder = os .path .dirname (filepath )
183+
184+ # Valid Relion STAR files always have their data in the first loop of the first block.
185+ # We are getting the first (and only) block in this StarFile object
186+ df = StarFile (filepath ).get_block_by_index (0 )
187+ # convert STAR file strings to data type for each field
188+ # columns without a specified data type are read as dtype=object
189+ column_types = {
190+ name : RelionSource .relion_metadata_fields .get (name , str )
191+ for name in df .columns
192+ }
193+ df = df .astype (column_types )
194+
195+ # particle locations are stored as e.g. '000001@first_micrograph.mrcs'
196+ # in the _rlnImageName column. here, we're splitting this information
197+ # so we can get the particle's index in the .mrcs stack as an int
198+ df [["__mrc_index" , "__mrc_filename" ]] = df ["_rlnImageName" ].str .split (
199+ "@" , 1 , expand = True
145200 )
146- self .unique_filters = filters
147- self .filter_indices = filter_indices
201+ # __mrc_index corresponds to the integer index of the particle in the __mrc_filename stack
202+ # Note that this is 1-based indexing
203+ df ["__mrc_index" ] = pd .to_numeric (df ["__mrc_index" ])
204+
205+ # Adding a full-filepath field to the Dataframe helps us save time later
206+ # Note that os.path.join works as expected when the second argument is an absolute path itself
207+ df ["__mrc_filepath" ] = df ["__mrc_filename" ].apply (
208+ lambda filename : os .path .join (data_folder , filename )
209+ )
210+
211+ if max_rows is None :
212+ return df
213+ else :
214+ max_rows = min (max_rows , len (df ))
215+ return df .iloc [:max_rows ]
148216
149217 def __str__ (self ):
150218 return f"RelionSource ({ self .n } images of size { self .L } x{ self .L } )"
@@ -162,6 +230,7 @@ def load_single_mrcs(filepath, df):
162230 # the code below reshapes it to (1, resolution, resolution)
163231 if len (arr .shape ) == 2 :
164232 arr = arr .reshape ((1 ,) + arr .shape )
233+ # __mrc_index is the 1-based index of the particle in the stack
165234 data = arr [df ["__mrc_index" ] - 1 , :, :]
166235
167236 return df .index , data
0 commit comments