55
66import mrcfile
77import numpy as np
8- import pandas as pd
98
109from aspire .image import Image
1110from aspire .operators import CTFFilter , IdentityFilter
@@ -20,7 +19,7 @@ class RelionSource(ImageSource):
2019 A RelionSource represents a source of picked and cropped particles stored as slices in a `.mrcs` stack.
2120 It must be instantiated via a STAR file, which--at a minimum--lists the particles in each `.mrcs` stack in the
2221 `_rlnImageName` column. The STAR file may also contain Relion-specific metadata columns. This information
23- is read into a Pandas DataFrame table containing a row for each particle specifying its location and
22+ is read into dictionaries containing rows for each particle specifying its location and
2423 its metadata. The metadata table may be augmented or modified via helper methods found in ImageSource. It may
2524 store, for example, Filter objects added during preprocessing.
2625 """
@@ -61,12 +60,12 @@ def __init__(
6160
6261 metadata = self .populate_metadata ()
6362
64- n = len (metadata )
63+ n = len (metadata [ "__mrc_filepath" ] )
6564 if n == 0 :
6665 raise RuntimeError ("No mrcs files found for starfile!" )
6766
6867 # Peek into the first image and populate some attributes
69- first_mrc_filepath = metadata . loc [ 0 ][ "__mrc_filepath" ]
68+ first_mrc_filepath = metadata [ "__mrc_filepath" ][ 0 ]
7069 mrc = mrcfile .open (first_mrc_filepath )
7170
7271 # Get the 'mode' (data type) - TODO: There's probably a more direct way to do this.
@@ -106,10 +105,13 @@ def __init__(
106105 "_rlnAmplitudeContrast" ,
107106 ]
108107 # If these all exist in the STAR file, we may create CTF filters for the source
109- if set (CTF_params ).issubset (metadata .columns ):
108+ if set (CTF_params ).issubset (metadata .keys () ):
110109 # partition particles according to unique CTF parameters
110+ ctf_data = np .stack (metadata [k ] for k in CTF_params ).T
111111 filter_params , filter_indices = np .unique (
112- metadata [CTF_params ].values , return_inverse = True , axis = 0
112+ ctf_data ,
113+ return_inverse = True ,
114+ axis = 0 ,
113115 )
114116 filters = []
115117 # for each unique CTF configuration, create a CTFFilter object
@@ -132,7 +134,7 @@ def __init__(
132134 self .filter_indices = filter_indices
133135
134136 # We have provided some, but not all the required params
135- elif any (param in metadata . columns for param in CTF_params ):
137+ elif any (param in metadata for param in CTF_params ):
136138 logger .warning (
137139 f"Found partially populated CTF Params."
138140 f" To automatically populate CTFFilters provide { CTF_params } "
@@ -151,7 +153,7 @@ def __init__(
151153 def populate_metadata (self ):
152154 """
153155 Relion STAR files may contain a large number of metadata columns in addition
154- to the locations of particles. We read this into a Pandas DataFrame and add some of
156+ to the locations of particles. We read this into a dict and add some of
155157 our own columns for convenience.
156158 """
157159 if self .data_folder is not None :
@@ -167,25 +169,24 @@ def populate_metadata(self):
167169 # particle locations are stored as e.g. '000001@first_micrograph.mrcs'
168170 # in the _rlnImageName column. here, we're splitting this information
169171 # so we can get the particle's index in the .mrcs stack as an int
170- metadata [["__mrc_index" , "__mrc_filename" ]] = metadata [
171- "_rlnImageName"
172- ].str .split ("@" , n = 1 , expand = True )
172+ indices_filenames = [s .split ("@" ) for s in metadata ["_rlnImageName" ]]
173173 # __mrc_index corresponds to the integer index of the particle in the __mrc_filename stack
174174 # Note that this is 1-based indexing
175- metadata ["__mrc_index" ] = pd .to_numeric (metadata ["__mrc_index" ])
175+ metadata ["__mrc_index" ] = np .array ([int (s [0 ]) for s in indices_filenames ])
176+ metadata ["__mrc_filename" ] = np .array ([s [1 ] for s in indices_filenames ])
176177
177178 # Adding a full-filepath field to the metadata dict helps us save time later
178179 # Note that os.path.join works as expected when the second argument is an absolute path itself
179- metadata ["__mrc_filepath" ] = metadata [ "__mrc_filename" ]. apply (
180- lambda filename : os .path .join (self .data_folder , filename )
180+ metadata ["__mrc_filepath" ] = np . array (
181+ [ os .path .join (self .data_folder , p ) for p in metadata [ "__mrc_filename" ]]
181182 )
182183
183184 # finally, truncate each metadata column to at most max_rows
184185 if self .max_rows is None :
185186 return metadata
186187 else :
187- max_rows = min (self .max_rows , len (metadata ))
188- return metadata . iloc [:max_rows ]
188+ max_rows = min (self .max_rows , len (metadata [ "__mrc_filepath" ] ))
189+ return { k : v [:max_rows ] for k , v in metadata . items ()}
189190
190191 def __str__ (self ):
191192 return f"RelionSource ({ self .n } images of size { self .L } x{ self .L } )"
@@ -209,34 +210,38 @@ def _images(self, indices):
209210 # Log the indices in case needed to debug a crash
210211 logger .debug (f"Indices: { indices } " )
211212
212- def load_single_mrcs (filepath , df ):
213+ def load_single_mrcs (filepath , indices ):
213214 arr = mrcfile .open (filepath ).data
214215 # if the stack only contains one image, arr will have shape (resolution, resolution)
215216 # the code below reshapes it to (1, resolution, resolution)
216217 if len (arr .shape ) == 2 :
217218 arr = arr .reshape ((1 ,) + arr .shape )
218219 # __mrc_index is the 1-based index of the particle in the stack
219- data = arr [df ["__mrc_index" ] - 1 , :, :]
220+ data = arr [self . _metadata ["__mrc_index" ][ indices ] - 1 , :, :]
220221
221- return df . index , data
222+ return indices , data
222223
223224 n_workers = self .n_workers
224225 if n_workers < 0 :
225226 n_workers = cpu_count () - 1
226227
227- df = self ._metadata .loc [indices ]
228228 im = np .empty (
229229 (len (indices ), self ._original_resolution , self ._original_resolution ),
230230 dtype = self .dtype ,
231231 )
232232
233- groups = df .groupby ("__mrc_filepath" )
234- n_workers = min (n_workers , len (groups ))
233+ filepaths , filepath_indices = np .unique (
234+ self ._metadata ["__mrc_filepath" ], return_inverse = True
235+ )
236+ n_workers = min (n_workers , len (filepaths ))
235237
236238 with futures .ThreadPoolExecutor (n_workers ) as executor :
237239 to_do = []
238- for filepath , _df in groups :
239- future = executor .submit (load_single_mrcs , filepath , _df )
240+ for i , filepath in enumerate (filepaths ):
241+ this_filepath_indices = np .where (filepath_indices == i )[0 ]
242+ future = executor .submit (
243+ load_single_mrcs , filepath , this_filepath_indices
244+ )
240245 to_do .append (future )
241246
242247 for future in futures .as_completed (to_do ):
0 commit comments