From b68a844ce242b5b5f959fa587835b6bee85dceaf Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Sat, 8 Oct 2016 23:30:45 -0400 Subject: [PATCH 001/181] RF(possibly BK): removing session embedding into seqinfo, preparing for getting arb set of files Decided to split away 'session' notion, which was pretty much used in the script only whenever multiple tarballs are provided, as a sign of multiple sessions. Otherwise -- session is given on cmdline, and thus processing that particular session might be 'incompatible' with future runs for other sessions if we save the entire mapping in a single file which we might load, and which wouldn't have that session information. Also, to allow for consumption of arbitrary set of dicoms, which might be coming from different studies and sessions, we need to analyze/group before extracting useful session sequence information. So, that was also in preparation to that --- bin/heudiconv | 473 ++++++++++++++++++++++++++-------------- heuristics/dbic_bids.py | 2 + 2 files changed, 311 insertions(+), 164 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index be56038c..5737de95 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -26,6 +26,7 @@ from tempfile import mkdtemp import tarfile from collections import namedtuple +from collections import defaultdict from os.path import isdir import logging @@ -61,6 +62,47 @@ SeqInfo = namedtuple( ] ) +StudySessionInfo = namedtuple( + 'StudySessionInfo', + [ + 'locator', # possible prefix identifying the study, e.g. PI/dataset or just a dataset or empty (default) + # Note that ATM there should be no multiple DICOMs with the same + # StudyInstanceUID which would collide, i.e point to the same + # subject/session. + # So 'locator' is pretty much an assignment from StudyInstanceUID + # into some place within hierarchy + 'session', # could be None + 'subject', # should be some ID defined either in cmdline or deduced + ] +) + + +class TempDirs(object): + """A helper to centralize handling and cleanup of dirs""" + + def __init__(self): + self.dirs = [] + + def __call__(self, prefix=None): + tmpdir = mkdtemp(prefix=prefix) + self.dirs.append(tmpdir) + return tmpdir + + def __del__(self): + self.cleanup() + + def cleanup(self): + for t in self.dirs[:]: + self.rmtree(t) + + def rmtree(self, tmpdir): + if os.path.exists(tmpdir): + shutil.rmtree(tmpdir) + if tmpdir in self.dirs: + self.dirs.remove(tmpdir) + +tempdirs = TempDirs() + def save_json(filename, data): """Save data to a json file @@ -142,92 +184,83 @@ def find_files(regex, topdir=curdir, exclude=None, exclude_vcs=True, dirs=False) find_files.__doc__ %= (_VCS_REGEX,) -def process_dicoms(fl, dcmsession=None, basedir=None, dcmfilter=None): - """Process list of dicoms and return seqinfo and file group +def group_dicoms_into_seqinfos(fl, dcmfilter=None): + """Process list of dicoms and return seqinfo and file group + + `seqinfo` contains per-sequence extract of fields from DICOMs which + will be later provided into heuristics to decide on filenames Parameters ---------- fl : list of str List of files to consider - dcmsession : list of int, optional - List of session ids (?) 
for each file if `fl` - corresponds to multiple sessions - basedir : str, optional - Base directory relative to which filenames are provided in `fl` dcmfilter : callable, optional If called on dcm_data and returns True, it is used to set series_id Returns ------- - seqinfo, filegrp : list of list, dict + seqinfo : list of list `seqinfo` is a list of info entries per each sequence (some entry there defines a key for `filegrp`) - `filegrp` is a dictionary with files groupped per each sequence. + filegrp : dict + `filegrp` is a dictionary with files groupped per each sequence """ + lgr.info("Analyzing %d dicoms", len(fl)) import dcmstack as ds import dicom as dcm - if dcmsession is None: - multi_session = False - dcmsession = [0] * len(fl) - else: - multi_session = True - groups = [[], []] mwgroup = [] + for fidx, filename in enumerate(fl): - if not basedir is None: - filename = os.path.join(basedir, filename) mw = ds.wrapper_from_data(dcm.read_file(filename, force=True)) + + for f in ('iop', 'ICE_Dims', 'SequenceName'): + try: + del mw.series_signature[f] + except: + pass + try: - del mw.series_signature['iop'] - except: - pass - try: - del mw.series_signature['ICE_Dims'] - except: - pass - try: - del mw.series_signature['SequenceName'] - except: - pass - try: - series_id = (dcmsession[fidx], - mw.dcm_data.SeriesNumber, + series_id = (mw.dcm_data.SeriesNumber, mw.dcm_data.ProtocolName) - except AttributeError: + except AttributeError as exc: + lgr.warning('Ignoring %s since not quite a "normal" DICOM: %s', + filename, exc) # not a normal DICOM -> ignore - series_id = (dcmsession[fidx], -1, 'none') + series_id = (-1, 'none') + if not series_id[0] < 0: if dcmfilter is not None and dcmfilter(mw.dcm_data): - series_id = (dcmsession[fidx], -1, mw.dcm_data.ProtocolName) + series_id = (-1, mw.dcm_data.ProtocolName) + if not groups: + # yoh: I don't think this would ever be executed! 
mwgroup.append(mw) groups[0].append(series_id) groups[1].append(len(mwgroup) - 1) continue - N = len(mwgroup) # filter out unwanted non-image-data DICOMs by assigning # a series number < 0 (see test below) if not series_id[1] < 0 and mw.dcm_data[0x0008, 0x0016].repval in ( 'Raw Data Storage', 'GrayscaleSoftcopyPresentationStateStorage'): - series_id = (dcmsession[fidx], -1, mw.dcm_data.ProtocolName) + series_id = (-1, mw.dcm_data.ProtocolName) #print fidx, N, filename ingrp = False - for idx in range(N): + for idx in range(len(mwgroup)): same = mw.is_same_series(mwgroup[idx]) #print idx, same, groups[idx][0] if same: ingrp = True - if series_id[1] >= 0: - series_id = (dcmsession[fidx], - mwgroup[idx].dcm_data.SeriesNumber, + if series_id[0] >= 0: + series_id = (mwgroup[idx].dcm_data.SeriesNumber, mwgroup[idx].dcm_data.ProtocolName) groups[0].append(series_id) groups[1].append(idx) + if not ingrp: mwgroup.append(mw) groups[0].append(series_id) @@ -240,8 +273,8 @@ def process_dicoms(fl, dcmsession=None, basedir=None, dcmfilter=None): seqinfo = [] # for the next line to make any sense the series_id needs to # be sortable in a way that preserves the series order - for series, mwidx in sorted(group_map.items()): - if series[1] < 0: + for series_id, mwidx in sorted(group_map.items()): + if series_id[0] < 0: # skip our fake series with unwanted files continue mw = mwgroup[mwidx] @@ -250,14 +283,13 @@ def process_dicoms(fl, dcmsession=None, basedir=None, dcmfilter=None): # nothing to see here, just move on continue dcminfo = mw.dcm_data - files = [fl[i] for i, s in enumerate(groups[0]) if s == series] + files = [fl[i] for i, s in enumerate(groups[0]) if s == series_id] # turn the series_id into a human-readable string -- string is needed # for JSON storage later on - if multi_session: - series = '%i-%i-%s' % series - else: - series = '%i-%s' % series[1:] - filegroup[series] = files + series_id = '%i-%s' % series_id + if series_id in filegroup: + raise RuntimeError("Already processed series %r" % series_id) + filegroup[series_id] = files size = list(mw.image_shape) + [len(files)] total += size[-1] if len(size) < 4: @@ -274,7 +306,7 @@ def process_dicoms(fl, dcmsession=None, basedir=None, dcmfilter=None): info = SeqInfo( total, os.path.split(files[0])[1], - series, + series_id, '-', '-', '-', size[0], size[1], size[2], size[3], TR, TE, @@ -303,10 +335,9 @@ def process_dicoms(fl, dcmsession=None, basedir=None, dcmfilter=None): len(dcminfo.get('SourceImageSequence', '')), info.image_type )) - #if dcminfo.SeriesDescription == 'ep2d_bold_moco_p2': - # import pdb; pdb.set_trace() seqinfo.append(info) - lgr.info("Generated sequence info with %d entries", len(info)) + + lgr.info("Generated sequence info with %d entries", len(seqinfo)) return seqinfo, filegroup @@ -322,7 +353,7 @@ def read_config(infile): return info -def conversion_info(subject, outdir, info, filegroup, basedir=None, ses=None): +def conversion_info(subject, outdir, info, filegroup, ses=None): convert_info = [] for key, items in info.items(): if not items: @@ -360,9 +391,6 @@ def conversion_info(subject, outdir, info, filegroup, basedir=None, ses=None): except KeyError: files = filegroup[unicode(item)] - if basedir is not None: - files = [os.path.join(basedir, f) for f in files] - outprefix = template.format(**parameters) convert_info.append((os.path.join(outpath, outprefix), outtype, files)) @@ -421,16 +449,16 @@ def convert(items, symlink=True, converter=None, else: outtypes = [item[1]] prefix = item[0] - print('Converting %s' % prefix) 
dirname = os.path.dirname(prefix + '.ext') outname_bids = prefix + '.json' - print(dirname) + lgr.info('Converting %s -> %s . Converter: %s', + prefix, dirname, converter) if not os.path.exists(dirname): os.makedirs(dirname) for outtype in outtypes: item_dicoms = item[2] lgr.info("Processing %d dicoms", len(item_dicoms)) - lgr.debug(" those dicoms are: %s", item_dicoms) + lgr.log(1, " those dicoms are: %s", item_dicoms) if outtype == 'dicom': if is_bids: if not os.path.exists(sourcedir): @@ -460,7 +488,6 @@ def convert(items, symlink=True, converter=None, config.enable_provenance() from nipype import Function, Node from nipype.interfaces.base import isdefined - print(converter) if converter == 'mri_convert': from nipype.interfaces.freesurfer.preprocess import MRIConvert convertnode = Node(MRIConvert(), name='convert') @@ -489,7 +516,7 @@ def convert(items, symlink=True, converter=None, res = convertnode.run() if isinstance(res.outputs.converted_files, list): lgr.warning( - "Following series likely has multiple orientations: ", + "Following series files likely have multiple orientations: %s", item_dicoms ) for idx, fl in enumerate(res.outputs.converted_files): @@ -559,97 +586,66 @@ def convert(items, symlink=True, converter=None, shutil.rmtree(tmpdir) -def convert_dicoms(subjs, dicom_dir_template, outdir, heuristic_file, converter, - queue=None, anon_sid_cmd=None, anon_outdir=None, with_prov=False, - ses=None, is_bids=False): - for sid in subjs: - tmpdir = None +def convert_dicoms(sid, + dicoms, + outdir, + heuristic, + converter, + queue=None, + anon_sid_cmd=None, anon_outdir=None, with_prov=False, + ses=None, + is_bids=False): + if True: # just to minimize diff for now, remove later and dedent + # + # TODO: Also better lives outside and just replicates all cmdline args? + # if queue: # TODO This needs to be updated to better scale with additional args progname = os.path.abspath(inspect.getfile(inspect.currentframe())) - convertcmd = ' '.join(['python', progname, '-d', dicom_dir_template, - '-o', outdir, '-f', heuristic_file, '-s', sid, + convertcmd = ' '.join(['python', progname, + '-o', outdir, + '-f', heuristic.filename, + '-s', sid, '-c', converter]) if ses: - convertcmd += " --ses " + str(ses) + convertcmd += " --ses '%s'" % ses if with_prov: convertcmd += " --with-prov" if is_bids: convertcmd += " --bids" + convertcmd += ["'%s'" % f for f in dicoms] + script_file = 'dicom-%s.sh' % sid with open(script_file, 'wt') as fp: fp.writelines(['#!/bin/bash\n', convertcmd]) - outcmd = 'sbatch -J dicom-%s -p %s -N1 -c2 --mem=20G %s' % (sid, queue, script_file) + outcmd = 'sbatch -J dicom-%s -p %s -N1 -c2 --mem=20G %s' \ + % (sid, queue, script_file) os.system(outcmd) - continue + return - # TODO: RF into a function - # expand the input template - if sid: - sdir = dicom_dir_template % sid - # and see what matches - fl = sorted(glob(sdir)) - else: - # we were given no subject so we consider dicom_dir_template to be - # and actual directory to process - if not isdir(dicom_dir_template): - raise ValueError( - "No subject id was provided, and dicom_dir_template=%s is not " - "an existing directory to traverse" % str(dicom_dir_template) - ) - fl = sorted(find_files('.*', topdir=dicom_dir_template)) - - lgr.info("Processing %d dicoms", len(fl)) - dcmsessions = None - # some people keep compressed tarballs around -- be nice! 
- if len(fl) and tarfile.is_tarfile(fl[0]): - # check if we got a list of tarfiles - if not len(fl) == sum([tarfile.is_tarfile(i) for i in fl]): - raise ValueError("some but not all input files are tar files") - # tarfiles already know what they contain, and often the filenames - # are unique, or at least in a unqiue subdir per session - # strategy: extract everything in a temp dir and assemble a list - # of all files in all tarballs - content = [] - tmpdir = mkdtemp(prefix='heudiconvtmp') - # needs sorting to keep the generated "session" label deterministic - for i, t in enumerate(sorted(fl)): - tf = tarfile.open(t) - # check content and sanitize permission bits - tmembers = tf.getmembers() - for tm in tmembers: - tm.mode = 0o700 - # get all files, assemble full path in tmp dir - tf_content = [m.name for m in tmembers if m.isfile()] - content += tf_content - if len(fl) > 1: - # more than one tarball, take care of "session" indicator - if dcmsessions is None: - dcmsessions = [] - dcmsessions += ([i] * len(tf_content)) - # extract into tmp dir - tf.extractall(path=tmpdir, members=tmembers) - fl = content - - #dcmfile = dcm.read_file(fl[0], force=True) - #print sid, 'Dicom: ', dcmfile.PatientName, sid == dcmfile.PatientName + dicoms = dicoms + (lgr.info if dicoms else lgr.error)("Processing %d dicoms", len(dicoms)) + + # in this reimplementation we can have only a single session assigned + # at this point + # dcmsessions = seqinfo = None # we might need it earlier than later if not sid: # figure out the sid out of available information - seqinfo, filegroup = process_dicoms( - fl, dcmsessions, basedir=tmpdir, - dcmfilter=getattr(mod, 'filter_dicom', None)) + seqinfo, filegroup = group_dicoms_into_seqinfos( + dicoms, + dcmfilter=getattr(heuristic, 'filter_dicom', None)) # XXX session information handling is somewhat backwards since done above # already. 
Moreover above logic with .edit.txt file -- seqinfo is # available only on initial run - TODO - if not hasattr(mod, 'get_session_subject_id'): + if not hasattr(heuristic, 'get_session_subject_id'): raise ValueError( "%s has no get_session_subject_id needed to figure out " - "subject/session from DICOMs" % mod + "subject/session from DICOMs" % heuristic ) - session_subject_ids = set(mod.get_session_subject_id(s) + session_subject_ids = set(heuristic.get_session_subject_id(s) for s in seqinfo) assert len(session_subject_ids) == 1, \ "atm we support processing only 1 subject/session at a time" @@ -666,7 +662,7 @@ def convert_dicoms(subjs, dicom_dir_template, outdir, heuristic_file, converter, if anon_outdir is None: anon_outdir = outdir - # Figure out where to stick supplemental info files + # Figure out where to stick supplemental info dicoms idir = os.path.join(outdir, sid) if is_bids and ses: idir = os.path.join(idir, 'ses-%s' % str(ses)) @@ -676,33 +672,33 @@ def convert_dicoms(subjs, dicom_dir_template, outdir, heuristic_file, converter, if not os.path.exists(idir): os.makedirs(idir) - shutil.copy(heuristic_file, idir) - path, fname = os.path.split(heuristic_file) - sys.path.append(path) - mod = __import__(fname.split('.')[0]) + shutil.copy(heuristic.filename, idir) + ses_suffix = "_ses-%s" % ses if ses is not None else "" + info_file = os.path.join(idir, '%s%s.auto.txt' % (sid, ses_suffix)) + edit_file = os.path.join(idir, '%s%s.edit.txt' % (sid, ses_suffix)) + filegroup_file = os.path.join(idir, 'filegroup%s.json' % ses_suffix) - infofile = os.path.join(idir, '%s.auto.txt' % sid) - editfile = os.path.join(idir, '%s.edit.txt' % sid) - if os.path.exists(editfile): # XXX may be condition on seqinfo is None + if os.path.exists(edit_file): # XXX may be condition on seqinfo is None lgr.info("Reloading existing filegroup.json because %s exists", - editfile) - info = read_config(editfile) - filegroup = load_json(os.path.join(idir, 'filegroup.json')) + edit_file) + info = read_config(edit_file) + filegroup = load_json(filegroup_file) else: if seqinfo is None: - seqinfo, filegroup = process_dicoms( - fl, dcmsessions, basedir=tmpdir, - dcmfilter=getattr(mod, 'filter_dicom', None)) + seqinfo, filegroup = group_dicoms_into_seqinfos( + dicoms, + dcmfilter=getattr(heuristic, 'filter_dicom', None)) else: lgr.debug("DICOMS were already processed, reusing that info") - save_json(os.path.join(idir, 'filegroup.json'), filegroup) - with open(os.path.join(idir, 'dicominfo.txt'), 'wt') as fp: + save_json(filegroup_file, filegroup) + dicominfo_file = os.path.join(idir, 'dicominfo%s.tsv' % ses_suffix) + with open(dicominfo_file, 'wt') as fp: for seq in seqinfo: fp.write('\t'.join([str(val) for val in seq]) + '\n') - lgr.debug("Calling out to %s.infodict", mod) - info = mod.infotodict(seqinfo) - write_config(infofile, info) - write_config(editfile, info) + lgr.debug("Calling out to %s.infodict", heuristic) + info = heuristic.infotodict(seqinfo) + write_config(info_file, info) + write_config(edit_file, info) # # Conversion @@ -718,24 +714,81 @@ def convert_dicoms(subjs, dicom_dir_template, outdir, heuristic_file, converter, if converter != 'none': lgr.info("Doing conversion using %s", converter) cinfo = conversion_info(anon_sid, tdir, info, filegroup, - basedir=tmpdir, ses=ses) convert(cinfo, converter=converter, scaninfo_suffix=getattr( - mod, 'scaninfo_suffix', '.json'), + heuristic, 'scaninfo_suffix', '.json'), custom_callable=getattr( - mod, 'custom_callable', None), + heuristic, 'custom_callable', None), 
with_prov=with_prov, is_bids=is_bids, sourcedir=sourcedir) - # - # Cleanup - # - if tmpdir is not None: - # clean up tmp dir with extracted tarball - shutil.rmtree(tmpdir) + +def get_extracted_dicoms(fl): + """Given a list of files, possibly extract some from tarballs + + For 'classical' heudiconv, if multiple tarballs are provided, they correspond + to different sessions, so here we would group into sessions and return + pairs `sessionid`, `files` with `sessionid` being None if no "sessions" + detected for that file or there was just a single tarball in the list + """ + # TODO: bring check back? + # if any(not tarfile.is_tarfile(i) for i in fl): + # raise ValueError("some but not all input files are tar files") + + # tarfiles already know what they contain, and often the filenames + # are unique, or at least in a unqiue subdir per session + # strategy: extract everything in a temp dir and assemble a list + # of all files in all tarballs + tmpdir = tempdirs(prefix='heudiconvtmp') + + sessions = defaultdict(list) + session = 0 + if not isinstance(fl, (list, tuple)): + fl = list(fl) + + # needs sorting to keep the generated "session" label deterministic + for i, t in enumerate(sorted(fl)): + # "classical" heudiconv has that heuristic to handle multiple + # tarballs as providing different sessions per each tarball + if not tarfile.is_tarfile(t): + sessions[None].append(t) + continue # the rest is tarball specific + + tf = tarfile.open(t) + # check content and sanitize permission bits + tmembers = tf.getmembers() + for tm in tmembers: + tm.mode = 0o700 + # get all files, assemble full path in tmp dir + tf_content = [m.name for m in tmembers if m.isfile()] + # store full paths to each file, so we don't need to drag along + # tmpdir as some basedir + sessions[session] = [opj(tmpdir, f) for f in tf_content] + session += 1 + # extract into tmp dir + tf.extractall(path=tmpdir, members=tmembers) + + if session == 1: + # we had only 1 session, so no really multiple sessions according + # to classical 'heudiconv' assumptions, thus just move them all into + # None + sessions[None] += sessions.pop(0) + + return sessions.items() + + +def load_heuristic(heuristic_file): + """Load heuristic from the file, return the module + """ + path, fname = os.path.split(heuristic_file) + sys.path.append(path) + mod = __import__(fname.split('.')[0]) + mod.filename = heuristic_file + return mod + # # Additional handlers @@ -784,14 +837,17 @@ s3 parser.add_argument('--version', action='version', version=__version__) parser.add_argument('-d', '--dicom_dir_template', dest='dicom_dir_template', - required=True, + required=False, help='''location of dicomdir that can be indexed with subject id. Tarballs (can be compressed) are supported in additions to directory. All matching tarballs for a subject are extracted and their content processed in a single pass''') - parser.add_argument('-s', '--subjects', dest='subjs', required=True, - type=str, nargs='+', help='list of subjects') + parser.add_argument('-s', '--subjects', dest='subjs', + type=str, nargs='*', + help='list of subjects. If not provided, DICOMS would ' + 'first be "sorted" and subject IDs deduced by the ' + 'heuristic') parser.add_argument('-c', '--converter', dest='converter', required=True, choices=('mri_convert', 'dcmstack', 'dcm2nii', 'dcm2niix', @@ -823,26 +879,115 @@ s3 parser.add_argument('-p', '--with-prov', dest='with_prov', action='store_true', help='''Store additional provenance information. 
Requires python-rdflib.''') parser.add_argument('-ss', '--ses', dest='session', default=None, - help='''session for longitudinal studies, default is none''') + help='''session for longitudinal study_sessions, default is none''') parser.add_argument('-b', '--bids', dest='bids', action='store_true', help='''flag for output into BIDS structure''') parser.add_argument('--dbg', action='store_true', dest='debug', help="do not catch exceptions and show exception traceback") + + parser.add_argument( + 'files', + nargs='*', + help="files (tarballs, dicoms) or directories containing files to " + "process. Specify one of the --dicom_dir_template or files " + "not both") + args = parser.parse_args(args) + # TODO: deprecate dicom_dir_template in favor of --files-templated or + # smth like that which could take {subject} {session} ... and process + # files argument(s) correspondingly before passing into group_dicoms_into_seqinfos + + if args.files and args.dicom_dir_template: + raise ValueError("Specify files or dicom_dir_template, not both") + if args.debug: setup_exceptionhook() - convert_dicoms(args.subjs, os.path.abspath(args.dicom_dir_template), - os.path.abspath(args.outputdir), - heuristic_file=os.path.realpath(args.heuristic_file), + # + # Load heuristic -- better do it asap to make sure it loads correctly + # + heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) + + # + # Deal with provided files or templates + # + + # + # pre-process provided list of files and possibly sort into groups/sessions + # + + files_groups = {} # ATM feels duplicating insides of convert_dicoms -- TODO: RF + # for now will be just + # Group files per each study/subject/session + study_sessions = {} + + dicom_dir_template = args.dicom_dir_template + files_opt = args.files + session = args.session + subjs = args.subjs + + # Move into a function! + if dicom_dir_template: + dicom_dir_template = os.path.abspath(dicom_dir_template) + assert not files_opt # see above TODO + assert subjs + # expand the input template + if '%s' not in dicom_dir_template: + raise ValueError( + "dicom dir template must have '%s' as a placeholder for a " + "subject id. Got %r" % dicom_dir_template) + for sid in subjs: + sdir = dicom_dir_template % sid + # and see what matches + files = sorted(glob(sdir)) + for session_, files_ in get_extracted_dicoms(files): + if session_ is not None and session: + lgr.warning( + "We had session specified (%s) but while analyzing " + "files got a new value %r (using it instead)" + % (session, session_)) + # in this setup we do not care about tracking "studies" so + # locator would be the same None + study_sessions[ + StudySessionInfo( + None, + session_ if session_ is not None else session, + sid, + )] = files_ + else: + # prep files + assert(files_opt) + files = [] + for f in files_opt: + if isdir(f): + files += sorted(find_files('.*', topdir=f)) + else: + files.append(f) + # in this scenario we don't care about sessions obtained this way + files = [] + for _, files_ in get_extracted_dicoms(args.files): + files += files_ + + # sort all DICOMS using heuristic + raise NotImplementedError() + + # extract tarballs, and replace their entries with expanded lists of files + # TODO: we might need to sort so sessions are ordered??? 
+ for (locator, session, subject), files in study_sessions.items(): + convert_dicoms( + subject, + files, + opj(os.path.abspath(args.outputdir), locator or ''), + heuristic=heuristic, converter=args.converter, queue=args.queue, anon_sid_cmd=args.anon_cmd, anon_outdir=args.conv_outputdir, with_prov=args.with_prov, - ses=args.session, + ses=session, is_bids=args.bids) + tempdirs.cleanup() if __name__ == '__main__': diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 2262041d..ca78d487 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -49,6 +49,8 @@ def infotodict(seqinfo): # lgr.debug("Figured out subject %s. Session prefix: %s", subject_id, session_prefix) # del subject_id # not to be used + # task -- if not specified -- where to get it??? + t1 = create_key('anat', 'T1w', outtype=and_dicom) t2 = create_key('anat', 'T2w', outtype=and_dicom) fm_diff = create_key('fmap', 'fieldmap-dwi') From da386dae76d237ce5de2e86ec1a1214923aa427f Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Sat, 8 Oct 2016 23:40:31 -0400 Subject: [PATCH 002/181] ENH: verify that whatever is given from groupping into seqinfo comes from the same study instance uid --- bin/heudiconv | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/bin/heudiconv b/bin/heudiconv index 5737de95..6f3d4a7b 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -213,6 +213,10 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None): groups = [[], []] mwgroup = [] + studyUID = None # for sanity check that all DICOMs came from the same + # "study". If not -- what is the use-case? (interrupted acquisition?) + # and how would then we deal with series numbers + # which would differ already for fidx, filename in enumerate(fl): mw = ds.wrapper_from_data(dcm.read_file(filename, force=True)) @@ -222,6 +226,10 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None): except: pass + if studyUID is None: + studyUID = mw.dcm_data.StudyInstanceUID + else: + assert studyUID == mw.dcm_data.StudyInstanceUID try: series_id = (mw.dcm_data.SeriesNumber, mw.dcm_data.ProtocolName) From d88d4485c240773408a3b9d5f07f24ce7a899fba Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Sun, 9 Oct 2016 00:34:55 -0400 Subject: [PATCH 003/181] ENH: allow to group using StudyInstanceUID --- bin/heudiconv | 69 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 16 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 6f3d4a7b..540b5651 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -184,7 +184,7 @@ def find_files(regex, topdir=curdir, exclude=None, exclude_vcs=True, dirs=False) find_files.__doc__ %= (_VCS_REGEX,) -def group_dicoms_into_seqinfos(fl, dcmfilter=None): +def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): """Process list of dicoms and return seqinfo and file group `seqinfo` contains per-sequence extract of fields from DICOMs which @@ -197,6 +197,9 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None): dcmfilter : callable, optional If called on dcm_data and returns True, it is used to set series_id + per_studyUID : bool, optional + Then would add a StudyInstanceUID into study id. So it would not then + generalize across re-runs on new data. 
Returns ------- @@ -226,24 +229,41 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None): except: pass - if studyUID is None: - studyUID = mw.dcm_data.StudyInstanceUID - else: - assert studyUID == mw.dcm_data.StudyInstanceUID + try: + studyUID_ = mw.dcm_data.StudyInstanceUID + except AttributeError: + #import pdb; pdb.set_trace() + lgr.info("File %s is missing any StudyInstanceUID" % filename) + studyUID_ = None + #continue + try: series_id = (mw.dcm_data.SeriesNumber, mw.dcm_data.ProtocolName) + studyUID_ = mw.dcm_data.StudyInstanceUID + + if not per_studyUID: + # verify that we are working with a single study + if studyUID is None: + studyUID = studyUID_ + else: + assert studyUID == studyUID_ except AttributeError as exc: lgr.warning('Ignoring %s since not quite a "normal" DICOM: %s', filename, exc) # not a normal DICOM -> ignore series_id = (-1, 'none') + studyUID_ = None if not series_id[0] < 0: if dcmfilter is not None and dcmfilter(mw.dcm_data): series_id = (-1, mw.dcm_data.ProtocolName) + if per_studyUID: + series_id = series_id + (studyUID,) + if not groups: + raise RuntimeError("Yarik really thinks this is never ran!") # yoh: I don't think this would ever be executed! mwgroup.append(mw) groups[0].append(series_id) @@ -252,7 +272,7 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None): # filter out unwanted non-image-data DICOMs by assigning # a series number < 0 (see test below) - if not series_id[1] < 0 and mw.dcm_data[0x0008, 0x0016].repval in ( + if not series_id[0] < 0 and mw.dcm_data[0x0008, 0x0016].repval in ( 'Raw Data Storage', 'GrayscaleSoftcopyPresentationStateStorage'): series_id = (-1, mw.dcm_data.ProtocolName) @@ -262,6 +282,8 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None): same = mw.is_same_series(mwgroup[idx]) #print idx, same, groups[idx][0] if same: + # the same series should have the same study uuid + assert mwgroup[idx].dcm_data.get('StudyInstanceUID', None) == studyUID_ ingrp = True if series_id[0] >= 0: series_id = (mwgroup[idx].dcm_data.SeriesNumber, @@ -294,7 +316,7 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None): files = [fl[i] for i, s in enumerate(groups[0]) if s == series_id] # turn the series_id into a human-readable string -- string is needed # for JSON storage later on - series_id = '%i-%s' % series_id + series_id = '-'.join(map(str, series_id)) if series_id in filegroup: raise RuntimeError("Already processed series %r" % series_id) filegroup[series_id] = files @@ -691,6 +713,12 @@ def convert_dicoms(sid, edit_file) info = read_config(edit_file) filegroup = load_json(filegroup_file) + # XXX Yarik finally understood why basedir was dragged along! + # So we could reuse the same PATHs definitions possibly consistent + # across re-runs... BUT that wouldn't work anyways if e.g. + # DICOMs dumped with SOP UUIDs thus differing across runs etc + # So either it would need to be brought back or reconsidered altogether + # (since no sample data to test on etc) else: if seqinfo is None: seqinfo, filegroup = group_dicoms_into_seqinfos( @@ -808,7 +836,8 @@ def is_interactive(): return sys.stdin.isatty() and sys.stdout.isatty() and sys.stderr.isatty() -_sys_excepthook = sys.excepthook # Just in case we ever need original one +_sys_excepthook = sys.excepthook # Just in case we ever need original one + def setup_exceptionhook(): """Overloads default sys.excepthook with our exceptionhook handler. 
@@ -819,15 +848,13 @@ def setup_exceptionhook(): def _pdb_excepthook(type, value, tb): if is_interactive(): - import traceback, pdb + import traceback + import pdb traceback.print_exception(type, value, tb) print() pdb.post_mortem(tb) else: lgr.warn("We cannot setup exception hook since not in interactive mode") - # we are in interactive mode or we don't have a tty-like - # device, so we call the default hook - #sys.__excepthook__(type, value, tb) _sys_excepthook(type, value, tb) sys.excepthook = _pdb_excepthook @@ -935,7 +962,7 @@ s3 session = args.session subjs = args.subjs - # Move into a function! + # TODO: Move into a function! if dicom_dir_template: dicom_dir_template = os.path.abspath(dicom_dir_template) assert not files_opt # see above TODO @@ -966,22 +993,32 @@ s3 else: # prep files assert(files_opt) + assert(not subjs) files = [] for f in files_opt: if isdir(f): files += sorted(find_files('.*', topdir=f)) else: files.append(f) + # in this scenario we don't care about sessions obtained this way - files = [] - for _, files_ in get_extracted_dicoms(args.files): - files += files_ + files_ = [] + for _, files_ex in get_extracted_dicoms(files): + files_ += files_ex # sort all DICOMS using heuristic + seqinfo, filegroup = group_dicoms_into_seqinfos( + files_, + dcmfilter=getattr(heuristic, 'filter_dicom', None), + per_studyUID=True) + import pdb; pdb.set_trace() + + # now we need to take those seqinfos and group into per locator/subject/session raise NotImplementedError() # extract tarballs, and replace their entries with expanded lists of files # TODO: we might need to sort so sessions are ordered??? + lgr.info("Need to process %d study sessions", len(study_sessions)) for (locator, session, subject), files in study_sessions.items(): convert_dicoms( subject, From 6bef1ed6eee57cd49791e78f647e4c8750e2e504 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Sun, 9 Oct 2016 01:03:47 -0400 Subject: [PATCH 004/181] RF: make group_dicom_into_seqinfos return a dict now that we use named tuple, easy to make it into a key. For file groups though we would need to unpair them, but it all just makes "alignment" easier and allows to group per studyUID easier --- bin/heudiconv | 85 ++++++++++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 540b5651..516dc086 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -27,6 +27,7 @@ import tarfile from collections import namedtuple from collections import defaultdict +from collections import OrderedDict as ordereddict from os.path import isdir import logging @@ -44,7 +45,7 @@ SeqInfo = namedtuple( 'SeqInfo', ['total_files_till_now', # 0 'example_dcm_file', # 1 - 'series_number', # 2 + 'series_id', # 2 'unspecified1', # 3 'unspecified2', # 4 'unspecified3', # 5 @@ -259,11 +260,9 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): if dcmfilter is not None and dcmfilter(mw.dcm_data): series_id = (-1, mw.dcm_data.ProtocolName) - if per_studyUID: - series_id = series_id + (studyUID,) - if not groups: raise RuntimeError("Yarik really thinks this is never ran!") + # if I was wrong -- then per_studyUID might need to go above # yoh: I don't think this would ever be executed! 
mwgroup.append(mw) groups[0].append(series_id) @@ -276,6 +275,11 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): 'Raw Data Storage', 'GrayscaleSoftcopyPresentationStateStorage'): series_id = (-1, mw.dcm_data.ProtocolName) + + if per_studyUID: + series_id = series_id + (studyUID_,) + + #print fidx, N, filename ingrp = False for idx in range(len(mwgroup)): @@ -288,6 +292,8 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): if series_id[0] >= 0: series_id = (mwgroup[idx].dcm_data.SeriesNumber, mwgroup[idx].dcm_data.ProtocolName) + if per_studyUID: + series_id = series_id + (studyUID_,) groups[0].append(series_id) groups[1].append(idx) @@ -299,8 +305,8 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): group_map = dict(zip(groups[0], groups[1])) total = 0 - filegroup = {} - seqinfo = [] + seqinfo = ordereddict() + # for the next line to make any sense the series_id needs to # be sortable in a way that preserves the series order for series_id, mwidx in sorted(group_map.items()): @@ -316,10 +322,12 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): files = [fl[i] for i, s in enumerate(groups[0]) if s == series_id] # turn the series_id into a human-readable string -- string is needed # for JSON storage later on + if per_studyUID: + studyUID = series_id[2] + series_id = series_id[:2] + series_id = '-'.join(map(str, series_id)) - if series_id in filegroup: - raise RuntimeError("Already processed series %r" % series_id) - filegroup[series_id] = files + size = list(mw.image_shape) + [len(files)] total += size[-1] if len(size) < 4: @@ -348,7 +356,7 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): dcminfo.StudyDescription, dcminfo.ReferringPhysicianName, dcminfo.SeriesDescription, - dcminfo.ImageType, + tuple(dcminfo.ImageType), ) # candidates # dcminfo.AccessionNumber @@ -358,17 +366,26 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): # dcminfo.PatientsAge # dcminfo.PatientsSex lgr.debug("%30s %27s %5s nref=%-2d nsrc=%-2d %s" % ( - info.series_number, + info.series_id, dcminfo.SeriesDescription, info.is_derived, len(dcminfo.get('ReferencedImageSequence', '')), len(dcminfo.get('SourceImageSequence', '')), info.image_type )) - seqinfo.append(info) + if per_studyUID: + if studyUID not in seqinfo: + seqinfo[studyUID] = ordereddict() + seqinfo[studyUID][info] = files + else: + seqinfo[info] = files - lgr.info("Generated sequence info with %d entries", len(seqinfo)) - return seqinfo, filegroup + if per_studyUID: + lgr.info("Generated sequence info for %d studies with %d entries total", + len(seqinfo), sum(map(len, seqinfo.values()))) + else: + lgr.info("Generated sequence info with %d entries", len(seqinfo)) + return seqinfo def write_config(outfile, info): @@ -660,27 +677,6 @@ def convert_dicoms(sid, # at this point # dcmsessions = - seqinfo = None # we might need it earlier than later - if not sid: - # figure out the sid out of available information - seqinfo, filegroup = group_dicoms_into_seqinfos( - dicoms, - dcmfilter=getattr(heuristic, 'filter_dicom', None)) - # XXX session information handling is somewhat backwards since done above - # already. 
Moreover above logic with .edit.txt file -- seqinfo is - # available only on initial run - TODO - if not hasattr(heuristic, 'get_session_subject_id'): - raise ValueError( - "%s has no get_session_subject_id needed to figure out " - "subject/session from DICOMs" % heuristic - ) - - session_subject_ids = set(heuristic.get_session_subject_id(s) - for s in seqinfo) - assert len(session_subject_ids) == 1, \ - "atm we support processing only 1 subject/session at a time" - sess, sid = session_subject_ids.pop() - # # Annonimization # @@ -720,12 +716,13 @@ def convert_dicoms(sid, # So either it would need to be brought back or reconsidered altogether # (since no sample data to test on etc) else: - if seqinfo is None: - seqinfo, filegroup = group_dicoms_into_seqinfos( - dicoms, - dcmfilter=getattr(heuristic, 'filter_dicom', None)) - else: - lgr.debug("DICOMS were already processed, reusing that info") + # TODO -- might have been done outside already! + seqinfo_dict = group_dicoms_into_seqinfos( + dicoms, + dcmfilter=getattr(heuristic, 'filter_dicom', None)) + seqinfo = list(seqinfo_dict.keys()) + filegroup = {si.series_id: x for si, x in seqinfo_dict.items()} + save_json(filegroup_file, filegroup) dicominfo_file = os.path.join(idir, 'dicominfo%s.tsv' % ses_suffix) with open(dicominfo_file, 'wt') as fp: @@ -1007,12 +1004,16 @@ s3 files_ += files_ex # sort all DICOMS using heuristic + # TODO: this one is not groupping by StudyUID but may be we should! seqinfo, filegroup = group_dicoms_into_seqinfos( files_, dcmfilter=getattr(heuristic, 'filter_dicom', None), per_studyUID=True) - import pdb; pdb.set_trace() + if not getattr(heuristic, 'sort_seqinfo', None): + raise NotImplementedError("For now, if no subj template is provided, requiring heuristic to have sort_seqinfo") + heuristic.sort_seqinfo(seqinfo) + import pdb; pdb.set_trace() # now we need to take those seqinfos and group into per locator/subject/session raise NotImplementedError() From 2727baef3e5591622b8f838dfed934fcefdd4f25 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Sun, 9 Oct 2016 01:05:24 -0400 Subject: [PATCH 005/181] RF: make group_dicom_into_seqinfos return a dict now that we use named tuple, easy to make it into a key. For file groups though we would need to unpair them, but it all just makes "alignment" easier and allows to group per studyUID easier --- bin/heudiconv | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 516dc086..48644e64 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1005,14 +1005,16 @@ s3 # sort all DICOMS using heuristic # TODO: this one is not groupping by StudyUID but may be we should! 
- seqinfo, filegroup = group_dicoms_into_seqinfos( + seqinfo_dict = group_dicoms_into_seqinfos( files_, dcmfilter=getattr(heuristic, 'filter_dicom', None), per_studyUID=True) if not getattr(heuristic, 'sort_seqinfo', None): raise NotImplementedError("For now, if no subj template is provided, requiring heuristic to have sort_seqinfo") - heuristic.sort_seqinfo(seqinfo) + + for studyUID, seqinfo in seqinfo_dict.items(): + heuristic.sort_seqinfo(seqinfo) import pdb; pdb.set_trace() # now we need to take those seqinfos and group into per locator/subject/session raise NotImplementedError() From 1995cde39d2012bd51c9f91ca849a567213708a1 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Sun, 9 Oct 2016 16:50:43 -0400 Subject: [PATCH 006/181] ENH+RF: first somewhat working dbic bids heuristic r=9; out=../outputs-all-reran$r; rm -rf $out; HEUDICONV_LOGLEVEL=DEBUG bin/heudiconv --dbg -f heuristics/dbic_bids.py -c dcm2niix -o $out -b ../dartmouth-phantoms/bids_test3 --- bin/heudiconv | 216 +++++++++++++++++--------- heuristics/dbic_bids.py | 333 +++++++++++++++++++++++++++++----------- 2 files changed, 386 insertions(+), 163 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 48644e64..51373936 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -60,6 +60,7 @@ SeqInfo = namedtuple( 'referring_physician_name', 'series_description', 'image_type', + 'accession_number', ] ) @@ -93,7 +94,9 @@ class TempDirs(object): self.cleanup() def cleanup(self): + lgr.info("Removing %d temporary directories", len(self.dirs)) for t in self.dirs[:]: + lgr.debug("Removing %s", t) self.rmtree(t) def rmtree(self, tmpdir): @@ -357,6 +360,7 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): dcminfo.ReferringPhysicianName, dcminfo.SeriesDescription, tuple(dcminfo.ImageType), + dcminfo.AccessionNumber, ) # candidates # dcminfo.AccessionNumber @@ -407,6 +411,8 @@ def conversion_info(subject, outdir, info, filegroup, ses=None): continue template = key[0] outtype = key[1] + # So no annotation_classes of any kind! so if not used -- what was the + # intension???? XXX outpath = outdir for idx, itemgroup in enumerate(items): if not isinstance(itemgroup, list): @@ -554,6 +560,8 @@ def convert(items, symlink=True, converter=None, 'dcm2niix': Dcm2niix}[converter](), name='convert') convertnode.base_dir = tmpdir + # need to be abspaths! + item_dicoms = map(os.path.abspath, item_dicoms) convertnode.inputs.source_names = item_dicoms if converter == 'dcm2nii': convertnode.inputs.gzip_output = outtype == 'nii.gz' @@ -641,12 +649,22 @@ def convert_dicoms(sid, queue=None, anon_sid_cmd=None, anon_outdir=None, with_prov=False, ses=None, - is_bids=False): + is_bids=False, + seqinfo=None): if True: # just to minimize diff for now, remove later and dedent # # TODO: Also better lives outside and just replicates all cmdline args? # if queue: + if seqinfo and not dicoms: + # flatten them all and provide into batching, which again + # would group them... 
heh + dicoms = sum(seqinfo.values(), []) + # so + raise NotImplementedError( + "we already groupped them so need to add a switch to avoid " + "any groupping, so no outdir prefix doubled etc" + ) # TODO This needs to be updated to better scale with additional args progname = os.path.abspath(inspect.getfile(inspect.currentframe())) convertcmd = ' '.join(['python', progname, @@ -670,8 +688,12 @@ def convert_dicoms(sid, os.system(outcmd) return - dicoms = dicoms - (lgr.info if dicoms else lgr.error)("Processing %d dicoms", len(dicoms)) + if dicoms: + lgr.info("Processing %d dicoms", len(dicoms)) + elif seqinfo: + lgr.info("Processing %d pre-sorted seqinfo entries", len(seqinfo)) + else: + raise ValueError("neither dicoms nor seqinfo dict was provided") # in this reimplementation we can have only a single session assigned # at this point @@ -717,9 +739,13 @@ def convert_dicoms(sid, # (since no sample data to test on etc) else: # TODO -- might have been done outside already! - seqinfo_dict = group_dicoms_into_seqinfos( - dicoms, - dcmfilter=getattr(heuristic, 'filter_dicom', None)) + if dicoms: + seqinfo_dict = group_dicoms_into_seqinfos( + dicoms, + dcmfilter=getattr(heuristic, 'filter_dicom', None)) + else: + # TODO: rename to avoid ambiguity etc + seqinfo_dict = seqinfo seqinfo = list(seqinfo_dict.keys()) filegroup = {si.series_id: x for si, x in seqinfo_dict.items()} @@ -823,6 +849,98 @@ def load_heuristic(heuristic_file): return mod +def get_study_sessions(dicom_dir_template, files_opt, heuristic, outputdir, + session, subjs): + """Given options from cmdline sort files or dicom seqinfos into + study_sessions which put together files for a single session of a subject + in a study + + Two major possible workflows: + - if dicom_dir_template provided -- doesn't pre-load DICOMs and just + loads files pointed by each subject and possibly sessions as corresponding + to different tarballs + - if files_opt is provided, sorts all DICOMs it can find under those paths + """ + study_sessions = {} + if dicom_dir_template: + dicom_dir_template = os.path.abspath(dicom_dir_template) + assert not files_opt # see above TODO + assert subjs + # expand the input template + if '%s' not in dicom_dir_template: + raise ValueError( + "dicom dir template must have '%s' as a placeholder for a " + "subject id. Got %r" % dicom_dir_template) + for sid in subjs: + sdir = dicom_dir_template % sid + # and see what matches + files = sorted(glob(sdir)) + for session_, files_ in get_extracted_dicoms(files): + if session_ is not None and session: + lgr.warning( + "We had session specified (%s) but while analyzing " + "files got a new value %r (using it instead)" + % (session, session_)) + # in this setup we do not care about tracking "studies" so + # locator would be the same None + study_sessions[ + StudySessionInfo( + None, + session_ if session_ is not None else session, + sid, + )] = files_ + else: + # prep files + assert (files_opt) + assert (not subjs) + files = [] + for f in files_opt: + if isdir(f): + files += sorted(find_files('.*', topdir=f)) + else: + files.append(f) + + # in this scenario we don't care about sessions obtained this way + files_ = [] + for _, files_ex in get_extracted_dicoms(files): + files_ += files_ex + + # sort all DICOMS using heuristic + # TODO: this one is not groupping by StudyUID but may be we should! 
+ seqinfo_dict = group_dicoms_into_seqinfos( + files_, + dcmfilter=getattr(heuristic, 'filter_dicom', None), + per_studyUID=True) + + if not getattr(heuristic, 'infotoids', None): + raise NotImplementedError( + "For now, if no subj template is provided, requiring " + "heuristic to have infotoids") + + for studyUID, seqinfo in seqinfo_dict.items(): + # so we have a single study, we need to figure out its + # locator, session, subject + # TODO: Try except to ignore those we can't handle? + # actually probably there should be a dedicated exception for + # heuristics to throw if they detect that the study they are given + # is not the one they would be willing to work on + ids = heuristic.infotoids(seqinfo.keys(), outputdir=outputdir) + # TODO: probably infotoids is doomed to do more and possibly + # split into multiple sessions!!!! but then it should be provided + # full seqinfo with files which it would place into multiple groups + study_session_info = StudySessionInfo( + ids.get('locator'), + ids.get('session', session), + ids.get('subject', None)) + if study_session_info in study_sessions: + raise ValueError( + "We already have a study session with the same value %s" + % study_session_info) + study_sessions[study_session_info] = seqinfo + + return study_sessions + + # # Additional handlers # @@ -949,84 +1067,38 @@ s3 # pre-process provided list of files and possibly sort into groups/sessions # - files_groups = {} # ATM feels duplicating insides of convert_dicoms -- TODO: RF # for now will be just # Group files per each study/subject/session - study_sessions = {} dicom_dir_template = args.dicom_dir_template files_opt = args.files session = args.session subjs = args.subjs + outputdir = os.path.abspath(args.outputdir) # TODO: Move into a function! - if dicom_dir_template: - dicom_dir_template = os.path.abspath(dicom_dir_template) - assert not files_opt # see above TODO - assert subjs - # expand the input template - if '%s' not in dicom_dir_template: - raise ValueError( - "dicom dir template must have '%s' as a placeholder for a " - "subject id. Got %r" % dicom_dir_template) - for sid in subjs: - sdir = dicom_dir_template % sid - # and see what matches - files = sorted(glob(sdir)) - for session_, files_ in get_extracted_dicoms(files): - if session_ is not None and session: - lgr.warning( - "We had session specified (%s) but while analyzing " - "files got a new value %r (using it instead)" - % (session, session_)) - # in this setup we do not care about tracking "studies" so - # locator would be the same None - study_sessions[ - StudySessionInfo( - None, - session_ if session_ is not None else session, - sid, - )] = files_ - else: - # prep files - assert(files_opt) - assert(not subjs) - files = [] - for f in files_opt: - if isdir(f): - files += sorted(find_files('.*', topdir=f)) - else: - files.append(f) - - # in this scenario we don't care about sessions obtained this way - files_ = [] - for _, files_ex in get_extracted_dicoms(files): - files_ += files_ex - - # sort all DICOMS using heuristic - # TODO: this one is not groupping by StudyUID but may be we should! 
- seqinfo_dict = group_dicoms_into_seqinfos( - files_, - dcmfilter=getattr(heuristic, 'filter_dicom', None), - per_studyUID=True) - - if not getattr(heuristic, 'sort_seqinfo', None): - raise NotImplementedError("For now, if no subj template is provided, requiring heuristic to have sort_seqinfo") - - for studyUID, seqinfo in seqinfo_dict.items(): - heuristic.sort_seqinfo(seqinfo) - import pdb; pdb.set_trace() - # now we need to take those seqinfos and group into per locator/subject/session - raise NotImplementedError() - - # extract tarballs, and replace their entries with expanded lists of files + study_sessions = get_study_sessions(dicom_dir_template, files_opt, + heuristic, outputdir, session, subjs)# extract tarballs, and replace their entries with expanded lists of files # TODO: we might need to sort so sessions are ordered??? lgr.info("Need to process %d study sessions", len(study_sessions)) - for (locator, session, subject), files in study_sessions.items(): + for (locator, session, subject), files_or_seqinfo in study_sessions.items(): + + if not len(files_or_seqinfo): + raise ValueError("nothing to process?") + # that is how life is ATM :-/ since we don't do sorting if subj + # template is provided + if isinstance(files_or_seqinfo, dict): + assert(isinstance(list(files_or_seqinfo.keys())[0], SeqInfo)) + files = None + seqinfo = files_or_seqinfo + else: + files = files_or_seqinfo + seqinfo = None + convert_dicoms( subject, files, - opj(os.path.abspath(args.outputdir), locator or ''), + opj(outputdir, locator or ''), heuristic=heuristic, converter=args.converter, queue=args.queue, @@ -1034,7 +1106,9 @@ s3 anon_outdir=args.conv_outputdir, with_prov=args.with_prov, ses=session, - is_bids=args.bids) + is_bids=args.bids, + seqinfo=seqinfo) + tempdirs.cleanup() diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index ca78d487..5547fc55 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -11,11 +11,14 @@ def create_key(subdir, file_suffix, outtype=('nii.gz',), if not subdir: raise ValueError('subdir must be a valid format string') # may be even add "performing physician" if defined?? - template = "{referring_physician_name}/{study_description}/{bids_subject_session_dir}/" \ + template = "{bids_subject_session_dir}/" \ "%s/{bids_subject_session_prefix}_%s" % (subdir, file_suffix) return template, outtype, annotation_classes +# XXX we killed session indicator! what should we do now?!!! +# WE DON:T NEED IT -- it will be provided into conversion_info as `session` +# So we just need subdir and file_suffix! def infotodict(seqinfo): """Heuristic evaluator for determining which runs belong where @@ -30,105 +33,114 @@ def infotodict(seqinfo): lgr.info("Processing %d seqinfo entries", len(seqinfo)) and_dicom = ('dicom', 'nii.gz') - # decide on global prefix (TODO -- manage to do it outside of here) - - # no checks for consistency for now -- assume that those fields we rely on - # ARE the same - # Actually all below is taken care of outside now! - # s = seqinfo[0] - # - # session_id, subject_id = get_session_subject_id(s) - # - # session_suffix = session_prefix = '' - # if session_id: - # session_suffix = "_ses-%s" % session_id - # session_prefix = "ses-%s/" % session_id - # if not subject_id: - # raise ValueError("Could not figure out the subject id") - # - # lgr.debug("Figured out subject %s. Session prefix: %s", subject_id, session_prefix) - # del subject_id # not to be used - - # task -- if not specified -- where to get it??? 
- - t1 = create_key('anat', 'T1w', outtype=and_dicom) - t2 = create_key('anat', 'T2w', outtype=and_dicom) - fm_diff = create_key('fmap', 'fieldmap-dwi') - dwi_ap = create_key('dwi', 'dir-AP_dwi', outtype=and_dicom) - dwi_pa = create_key('dwi', 'dir-PA_dwi', outtype=and_dicom) - fm_rest = create_key('fmap', 'fieldmap-rest') - rs = create_key('func', 'task-rest_run-{item:02d}_bold', outtype=and_dicom) - boldt1 = create_key('func', 'task-bird1back_run-{item:02d}_bold', outtype=and_dicom) - boldt2 = create_key('func', 'task-letter1back_run-{item:02d}_bold', outtype=and_dicom) - boldt3 = create_key('func', 'task-letter2back_run-{item:02d}_bold', outtype=and_dicom) - nofb_task = create_key('func', 'task-nofb_run-{item:02d}_bold', outtype=and_dicom) - fb_task = create_key('func', 'task-fb_run-{item:02d}_bold', outtype=and_dicom) - #info = {t1: [], t2: [], fm_diff:[], dwi_ap:[], dwi_pa:[], fm_rest:[], rs:[], - # boldt1:[], boldt2:[], boldt3:[], nofb_task:[], fb_task:[]} info = defaultdict(list) - last_run = len(seqinfo) skipped, skipped_unknown = [], [] + current_run = 0 + run_label = None # run- for s in seqinfo: template = None suffix = '' seq = [] - # figure out type of image from s.image_info - image_dir = { + # figure out type of image from s.image_info -- just for checking ATM + # since we primarily rely on encoded in the protocol name information + image_type_modality = { 'P': 'fmap', 'FMRI': 'func', 'MPR': 'anat', - 'M': 'anat', + # 'M': 'func', -- can be for scout, anat, bold 'DIFFUSION': 'dwi', + 'MIP_SAG': 'anat', # angiography + 'MIP_COR': 'anat', # angiography + 'MIP_TRA': 'anat', # angiography }.get(s.image_type[2], None) - if image_dir is None: - # must be exhaustive! - raise ValueError( - "Cannot figure out type of image with image_info %s" - % str(s.image_type) - ) + regd = parse_dbic_protocol_name(s.protocol_name) + + if not regd: + skipped_unknown.append(s.series_id) + continue + + modality = regd.pop('modality') + modality_label = regd.pop('modality_label', None) + + if image_type_modality and modality != image_type_modality: + import pdb; pdb.set_trace() + lgr.warning("Deduced modality to be %s from DICOM, but got %s out of %s", + image_type_modality, modality, s.protocol_name) if s.is_derived: # Let's for now stash those close to original images - image_dir += '/derivative' + # TODO: we might want a separate tree for all of this!? + # so more of a parameter to the create_key + modality += '/derivative' # just keep it lower case and without special characters + # XXXX what for??? seq.append(s.series_description.lower()) - # analyze s.protocol_name (series_number is based on it) for full name mapping etc - if image_dir == 'func': + + # analyze s.protocol_name (series_id is based on it) for full name mapping etc + if modality == 'func' and not modality_label: if '_pace_' in s.protocol_name: - suffix += '_pace' # or should it be part of seq- + modality_label = 'pace' # or should it be part of seq- else: # assume bold by default - suffix += '_bold' - - # TODO run. 
might be needed for fieldmap - - # .series_description in case of - sdesc = s.study_description - # temporary aliases for those phantoms which we already collected - # so we rename them into this - #MAPPING - - # the idea ias to have sequence names in the format like - # bids__bidsrecord - # in bids record we could have _run[+=] - # which would say to either increment run number from already encountered - # or reuse the last one - if seq: - suffix += 'seq-%s' % ('+'.join(seq)) - - if template: - info[template].append(s.series_number) - else: - # some are ok to skip and not to whine - if "_Scout_" in s.series_description: - skipped.append(s.series_number) - lgr.debug("Ignoring %s", s.series_number) + modality_label = 'bold' + + run = regd.get('run') + if run is not None: + # so we have an indicator for a run + if run == '+': + current_run += 1 + elif run == '=': + pass + elif run.isdigit(): + current_run_ = int(run) + if current_run_ < current_run: + lgr.warning( + "Previous run (%s) was larger than explicitly specified %s", + current_run, current_run_) + current_run = current_run_ else: - skipped_unknown.append(s.series_number) + raise ValueError( + "Don't know how to deal with run specification %s" % repr(run)) + if isinstance(current_run, str) and current_run.isdigit(): + current_run = int(current_run) + run_label = "run-" + ("%02d" % current_run + if isinstance(current_run, int) + else current_run) + + suffix_parts = [ + run_label, + None if not regd.get('task') else "task-%s" % regd['task'], + regd.get('bids'), + modality_label, + ] + # filter tose which are None, and join with _ + suffix = '_'.join(filter(bool, suffix_parts)) + + # # .series_description in case of + # sdesc = s.study_description + # # temporary aliases for those phantoms which we already collected + # # so we rename them into this + # #MAPPING + # + # # the idea ias to have sequence names in the format like + # # bids__bidsrecord + # # in bids record we could have _run[+=] + # # which would say to either increment run number from already encountered + # # or reuse the last one + # if seq: + # suffix += 'seq-%s' % ('+'.join(seq)) + + # some are ok to skip and not to whine + if "_Scout" in s.series_description: + skipped.append(s.series_id) + lgr.debug("Ignoring %s", s.series_id) + else: + template = create_key(modality, suffix) + info[template].append(s.series_id) info = dict(info) # convert to dict since outside functionality depends on it being a basic dict if skipped: @@ -139,20 +151,157 @@ def infotodict(seqinfo): return info -def get_session_subject_id(s): +def get_unique(seqinfos, attr): + """Given a list of seqinfos, which must have come from a single study + get specific attr, which must be unique across all of the entries + + If not -- fail! + + """ + values = set(getattr(si, attr) for si in seqinfos) + assert (len(values) == 1) + return values.pop() + + +# TODO: might need to do groupping per each session and return here multiple +# hits, or may be we could just somehow demarkate that it will be multisession +# one and so then later value parsed (again) in infotodict would be used??? 
+def infotoids(seqinfos, outputdir): # decide on subjid and session based on patient_id - pid_split = s.patient_id.split('_') - if len(pid_split) == 1: - # there were no explicit session - # then it is not a multi-session study - sid = s.patient_id - session_id = None - elif len(pid_split) == 2: - sid, session_id = pid_split - elif len(pid_split) == 3: - _nonanon_sid, session_id, sid = pid_split - else: - raise ValueError( - "No logic for more than 3 _-separated entries in patient_id. Got:" - " %s" % s.patient_id) - return session_id, sid + lgr.info("Processing sequence infos to deduce study/session") + study_description = get_unique(seqinfos, 'study_description') + subject = get_unique(seqinfos, 'patient_id') + locator = study_description.replace('^', '/') + + # TODO: actually check if given study is study we would care about + # and if not -- we should throw some ???? exception + + # So -- use `outputdir` and locator etc to see if for a given locator/subject + # and possible ses+ in the sequence names, so we would provide a sequence + # So might need to go through parse_dbic_protocol_name(s.protocol_name) + # to figure out presence of sessions. + ses_markers = [ + parse_dbic_protocol_name(s.protocol_name).get('session', None) for s in seqinfos + ] + ses_markers = filter(bool, ses_markers) # only present ones + + session = None + if ses_markers: + # we have a session or possibly more than one even + # let's figure out which case we have + nonsign_vals = set(ses_markers).difference('+=') + if nonsign_vals: + if set(ses_markers).intersection('+='): + raise NotImplementedError( + "Should not mix hardcoded session markers with incremental ones (+=)" + ) + # although we might want an explicit '=' to note the same session as + # mentioned before? + if len(nonsign_vals) > 1: + raise NotImplementedError( + "Cannot deal with multiple sessions in the same study yet!") + assert len(ses_markers) == 1 + session = ses_markers[0] + else: + # TODO - I think we are doomed to go through the sequence and split + # ... actually the same as with nonsign_vals, we just would need to figure + # out initial one if sign ones, and should make use of knowing + # outputdir + #raise NotImplementedError() + # Let's be lazy for now just to get somewhere + session = '001' + + return { + # TODO: request info on study from the JedCap + 'locator': locator, + # Sessions to be deduced yet from the names etc TODO + 'session': session, + 'subject': subject, + } + + +def parse_dbic_protocol_name(protocol_name): + """Parse protocol name + """ + + # Parse the name according to our convention + # https://docs.google.com/document/d/1R54cgOe481oygYVZxI7NHrifDyFUZAjOBwCTu7M7y48/edit?usp=sharing + import re + + bids_regex = re.compile( + r""" + bids_ # our prefix to signal BIDS layout + (?P[^-_]+)(-(?P[^-_]+))? # modality + (_ses(?P([+=]|-[^-_]+)))? # session + (_run(?P([+=]|-[^-_]+)))? # run + (_task-(?P[^-_]+))? # task + (?P(_[^_]+)+)? # more of _ separated items for generic BIDS + (__.*?)? 
# some custom suffix which will not be included anywhere + """, + flags=re.X + ) + + reg = bids_regex.match(protocol_name) + + if not reg: + lgr.debug("Did not match protocol %s as DBIC BIDS protocol", + protocol_name) + return {} + regd = reg.groupdict() + + # pop those which were not found (i.e None) + for k in list(regd.keys()): + if regd[k] is None: + regd.pop(k) + + for f in 'run', 'session': + # strip leading - in values + if f in regd: + regd[f] = regd[f].lstrip('-') + + # strip leading _ for consistency + if regd.get('bids', None) is not None: + regd['bids'] = regd['bids'].lstrip('_') + + # TODO: might want to check for all known "standard" BIDS suffixes here + + # if not regd.get('modality_label', None): + # # might need to assign a default label for each modality if was not + # # given + # regd['modality_label'] = { + # 'func': 'bold' + # }.get(regd['modality'], None) + + return regd + + +def test_parse_dbic_protocol_name(): + pdpn = parse_dbic_protocol_name + + assert pdpn("nondbic_func-bold") == {} + + assert pdpn("bids_func-bold") == \ + {'modality': 'func', 'modality_label': 'bold'} + + assert pdpn("bids_func_ses+_run+_task-boo") == \ + { + 'modality': 'func', 'modality_label': 'bold', + 'session': '+', + 'run': '+', + 'task': 'boo', + } + assert pdpn("bids_func-pace_ses-1_run-2_task-boo_bids-please__therest") == \ + { + 'modality': 'func', 'modality_label': 'pace', + 'session': '1', + 'run': '2', + 'task': 'boo', + 'bids': 'bids-please' + } + + assert pdpn("bids_anat-scout_ses+") == \ + { + 'modality': 'anat', + 'modality_label': 'scout', + 'session': '+', + } \ No newline at end of file From 4004b44e9dfe82e3c27ccee6533ebc8be3abe480 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 11 Oct 2016 21:34:27 -0400 Subject: [PATCH 007/181] ENH: dbic-bids heuristics adjustment, dump into sourcedata mimicing hierarchy of BIDS, safeguards against writing over --- bin/heudiconv | 93 +++++++++++++++++++---------- heuristics/dbic_bids.py | 128 +++++++++++++++++++++++++++------------- 2 files changed, 150 insertions(+), 71 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 51373936..1f5ab83a 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -30,6 +30,8 @@ from collections import defaultdict from collections import OrderedDict as ordereddict from os.path import isdir +PY3 = sys.version_info[0] >= 3 + import logging lgr = logging.getLogger('heudiconv') # Rudimentary logging support. 
If you want it better -- we need to have @@ -369,9 +371,10 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): # FOR demographics # dcminfo.PatientsAge # dcminfo.PatientsSex - lgr.debug("%30s %27s %5s nref=%-2d nsrc=%-2d %s" % ( + lgr.debug("%30s %27s %27s %5s nref=%-2d nsrc=%-2d %s" % ( info.series_id, dcminfo.SeriesDescription, + dcminfo.ProtocolName, info.is_derived, len(dcminfo.get('ReferencedImageSequence', '')), len(dcminfo.get('SourceImageSequence', '')), @@ -442,11 +445,9 @@ def conversion_info(subject, outdir, info, filegroup, ses=None): try: files = filegroup[item] except KeyError: - files = filegroup[unicode(item)] - + files = filegroup[(str if PY3 else unicode)(item)] outprefix = template.format(**parameters) convert_info.append((os.path.join(outpath, outprefix), outtype, files)) - return convert_info @@ -481,19 +482,36 @@ def embed_nifti(dcmfiles, niftifile, infofile, bids_info=None, force=False): def compress_dicoms(dicom_list, prefix, sourcedir): tmpdir = mkdtemp(prefix='dicomtar') outtar = os.path.join(sourcedir, prefix + '.dicom.tgz') - with tarfile.open(outtar, 'w:gz') as tar: + if os.path.exists(outtar): + raise RuntimeError("File %s already exists, will not override" + % outtar) + with tarfile.open(outtar, 'w:gz', dereference=True) as tar: for filename in dicom_list: outfile = os.path.join(tmpdir, os.path.basename(filename)) if not os.path.islink(outfile): - os.symlink(filename, outfile) - tar.add(outfile, recursive=False) + os.symlink(os.path.realpath(filename), outfile) + # place into archive stripping any lead directories and + # adding the one corresponding to prefix + tar.add(outfile, + arcname=opj(prefix, os.path.basename(outfile)), + recursive=False) tar.close() shutil.rmtree(tmpdir) +def safe_copyfile(src, dest): + """Copy file but blow if destination name already exists + """ + if os.path.isdir(dest): + dest = os.path.join(dest, os.path.basename(src)) + if os.path.lexists(dest): + raise ValueError("was asked to copy %s but destination already exists: %s" + % (src, dest)) + shutil.copyfile(src, dest) + def convert(items, symlink=True, converter=None, scaninfo_suffix='.json', custom_callable=None, with_prov=False, - is_bids=False, sourcedir=None): + is_bids=False, sourcedir=None, outdir=None): prov_files = [] tmpdir = mkdtemp(prefix='heudiconvtmp') for item in items: @@ -510,16 +528,23 @@ def convert(items, symlink=True, converter=None, os.makedirs(dirname) for outtype in outtypes: item_dicoms = item[2] - lgr.info("Processing %d dicoms", len(item_dicoms)) + lgr.info("Processing %d dicoms for output type %s", + len(item_dicoms), outtype) lgr.log(1, " those dicoms are: %s", item_dicoms) if outtype == 'dicom': if is_bids: - if not os.path.exists(sourcedir): - os.makedirs(sourcedir) - dicom_list = [] - for filename in item_dicoms: - dicom_list.append(filename) - compress_dicoms(dicom_list, os.path.basename(prefix), sourcedir) + # mimic the same hierarchy location as the prefix + # although it could all have been done probably + # within heuristic really + sourcedir_ = os.path.join( + sourcedir, + os.path.dirname( + os.path.relpath(prefix, outdir))) + if not os.path.exists(sourcedir_): + os.makedirs(sourcedir_) + compress_dicoms(item_dicoms, + os.path.basename(prefix), + sourcedir_) else: dicomdir = prefix + '_dicom' if os.path.exists(dicomdir): @@ -576,27 +601,31 @@ def convert(items, symlink=True, converter=None, ) for idx, fl in enumerate(res.outputs.converted_files): outname = prefix + '-' + str(idx) + '.' 
+ outtype - shutil.copyfile(fl, outname) + safe_copyfile(fl, outname) else: - shutil.copyfile(res.outputs.converted_files, outname) + safe_copyfile(res.outputs.converted_files, outname) + if isdefined(res.outputs.bvecs): outname_bvecs = prefix + '.bvec' outname_bvals = prefix + '.bval' - shutil.copyfile(res.outputs.bvecs, outname_bvecs) - shutil.copyfile(res.outputs.bvals, outname_bvals) + safe_copyfile(res.outputs.bvecs, outname_bvecs) + safe_copyfile(res.outputs.bvals, outname_bvals) if converter == 'dcm2niix' \ and isdefined(res.outputs.bids): ### extract bids try: - shutil.copyfile(res.outputs.bids, outname_bids) - except TypeError: ##catch lists - lgr.warning("There was someone catching lists!") + safe_copyfile(res.outputs.bids, outname_bids) + except TypeError as exc: ##catch lists + lgr.warning( + "There was someone catching lists!: %s", + exc + ) continue if with_prov: prov_file = prefix + '_prov.ttl' - shutil.copyfile(os.path.join(convertnode.base_dir, + safe_copyfile(os.path.join(convertnode.base_dir, convertnode.name, 'provenance.ttl'), prov_file) @@ -711,7 +740,7 @@ def convert_dicoms(sid, anon_outdir = outdir # Figure out where to stick supplemental info dicoms - idir = os.path.join(outdir, sid) + idir = os.path.join(outdir, '.heudiconv', sid) if is_bids and ses: idir = os.path.join(idir, 'ses-%s' % str(ses)) if anon_outdir == outdir: @@ -765,11 +794,13 @@ def convert_dicoms(sid, sourcedir = None if is_bids: - sourcedir = os.path.join(outdir, 'sourcedata', sid) - if ses: - sourcedir = os.path.join(sourcedir, 'ses-%s' % str(ses)) + sourcedir = os.path.join(outdir, 'sourcedata') + # the other portion of the path would mimic BIDS layout + # so we don't need to worry here about sub, ses at all + tdir = anon_outdir + else: + tdir = os.path.join(anon_outdir, anon_sid) - tdir = os.path.join(anon_outdir, anon_sid) if converter != 'none': lgr.info("Doing conversion using %s", converter) cinfo = conversion_info(anon_sid, tdir, info, filegroup, @@ -782,7 +813,8 @@ def convert_dicoms(sid, heuristic, 'custom_callable', None), with_prov=with_prov, is_bids=is_bids, - sourcedir=sourcedir) + sourcedir=sourcedir, + outdir=tdir) def get_extracted_dicoms(fl): @@ -896,7 +928,8 @@ def get_study_sessions(dicom_dir_template, files_opt, heuristic, outputdir, files = [] for f in files_opt: if isdir(f): - files += sorted(find_files('.*', topdir=f)) + files += sorted(find_files( + '.*', topdir=f, exclude_vcs=True, exclude="/\.datalad/")) else: files.append(f) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 5547fc55..583c99a7 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -6,7 +6,7 @@ lgr = logging.getLogger('heudiconv') -def create_key(subdir, file_suffix, outtype=('nii.gz',), +def create_key(subdir, file_suffix, outtype=('nii.gz', 'dicom'), annotation_classes=None): if not subdir: raise ValueError('subdir must be a valid format string') @@ -56,7 +56,15 @@ def infotodict(seqinfo): 'MIP_TRA': 'anat', # angiography }.get(s.image_type[2], None) - regd = parse_dbic_protocol_name(s.protocol_name) + protocol_name_tuned = s.protocol_name + # Few common replacements + if protocol_name_tuned in {'AAHead_Scout'}: + protocol_name_tuned = 'anat-scout' + + regd = parse_dbic_protocol_name(protocol_name_tuned) + + if s.image_type[2].startswith('MIP'): + regd['acq'] = regd.get('acq', '') + s.image_type[2] if not regd: skipped_unknown.append(s.series_id) @@ -66,9 +74,9 @@ def infotodict(seqinfo): modality_label = regd.pop('modality_label', None) if image_type_modality and 
modality != image_type_modality: - import pdb; pdb.set_trace() - lgr.warning("Deduced modality to be %s from DICOM, but got %s out of %s", - image_type_modality, modality, s.protocol_name) + lgr.warning( + "Deduced modality to be %s from DICOM, but got %s out of %s", + image_type_modality, modality, protocol_name_tuned) if s.is_derived: # Let's for now stash those close to original images @@ -79,10 +87,9 @@ def infotodict(seqinfo): # XXXX what for??? seq.append(s.series_description.lower()) - # analyze s.protocol_name (series_id is based on it) for full name mapping etc if modality == 'func' and not modality_label: - if '_pace_' in s.protocol_name: + if '_pace_' in protocol_name_tuned: modality_label = 'pace' # or should it be part of seq- else: # assume bold by default @@ -112,9 +119,10 @@ def infotodict(seqinfo): else current_run) suffix_parts = [ - run_label, None if not regd.get('task') else "task-%s" % regd['task'], + None if not regd.get('acq') else "acq-%s" % regd['acq'], regd.get('bids'), + run_label, modality_label, ] # filter tose which are None, and join with _ @@ -171,6 +179,7 @@ def infotoids(seqinfos, outputdir): lgr.info("Processing sequence infos to deduce study/session") study_description = get_unique(seqinfos, 'study_description') subject = get_unique(seqinfos, 'patient_id') + # TODO: fix up subject id if missing some 0s locator = study_description.replace('^', '/') # TODO: actually check if given study is study we would care about @@ -228,42 +237,70 @@ def parse_dbic_protocol_name(protocol_name): # https://docs.google.com/document/d/1R54cgOe481oygYVZxI7NHrifDyFUZAjOBwCTu7M7y48/edit?usp=sharing import re - bids_regex = re.compile( - r""" - bids_ # our prefix to signal BIDS layout - (?P[^-_]+)(-(?P[^-_]+))? # modality - (_ses(?P([+=]|-[^-_]+)))? # session - (_run(?P([+=]|-[^-_]+)))? # run - (_task-(?P[^-_]+))? # task - (?P(_[^_]+)+)? # more of _ separated items for generic BIDS - (__.*?)? # some custom suffix which will not be included anywhere - """, - flags=re.X - ) - - reg = bids_regex.match(protocol_name) - - if not reg: - lgr.debug("Did not match protocol %s as DBIC BIDS protocol", - protocol_name) + # TODO -- redo without mandating order of e.g. _run vs _task to go first, + # since BIDS somewhat imposes the order but it doesn't matter. So better be + # flexible -- split first on __ and then on _ within the first field and analyze + # bids_regex = re.compile( + # r""" + # (?P[^-_]+)(-(?P[^-_]+))? # modality + # (_ses(?P([+=]|-[^-_]+)))? # session + # (_run(?P([+=]|-[^-_]+)))? # run + # (_task-(?P[^-_]+))? # task + # (?P(_[^_]+)+)? # more of _ separated items for generic BIDS + # (__.*?)? 
# some custom suffix which will not be included anywhere + # """, + # flags=re.X + # ) + + # Remove possible suffix we don't care about after __ + protocol_name = protocol_name.split('__', 1)[0] + + bids = None # we don't know yet for sure + # We need to figure out if it is a valid bids + split = protocol_name.split('_') + prefix = split[0] + if prefix != 'bids' and '-' in prefix: + prefix, _ = prefix.split('-', 1) + if prefix == 'bids': + bids = True # for sure + split = split[1:] + + def split2(s): + # split on - if present, if not -- 2nd one returned None + if '-' in s: + return s.split('-', 1) + return s, None + + # Let's analyze first element which should tell us sequence type + modality, modality_label = split2(split[0]) + if modality not in {'anat', 'func', 'dwi', 'behav', 'fmap'}: + # It is not something we don't consume + if bids: + lgr.warning("It was instructed to be BIDS sequence but unknown " + "type %s found", modality) return {} - regd = reg.groupdict() - - # pop those which were not found (i.e None) - for k in list(regd.keys()): - if regd[k] is None: - regd.pop(k) - for f in 'run', 'session': - # strip leading - in values - if f in regd: - regd[f] = regd[f].lstrip('-') + regd = dict(modality=modality) + if modality_label: + regd['modality_label'] = modality_label + # now go through each to see if one which we care + bids_leftovers = [] + for s in split[1:]: + key, value = split2(s) + if value is None and key[-1] in "+=": + value = key[-1] + key = key[:-1] + if key in ['ses', 'run', 'task', 'acq']: + # those we care about explicitly + regd[{'ses': 'session'}.get(key, key)] = value + else: + bids_leftovers.append(s) - # strip leading _ for consistency - if regd.get('bids', None) is not None: - regd['bids'] = regd['bids'].lstrip('_') + if bids_leftovers: + regd['bids'] = '_'.join(bids_leftovers) # TODO: might want to check for all known "standard" BIDS suffixes here + # among bids_leftovers, thus serve some kind of BIDS validator # if not regd.get('modality_label', None): # # might need to assign a default label for each modality if was not @@ -279,23 +316,32 @@ def test_parse_dbic_protocol_name(): pdpn = parse_dbic_protocol_name assert pdpn("nondbic_func-bold") == {} + assert pdpn("cancelme_func-bold") == {} assert pdpn("bids_func-bold") == \ + pdpn("func-bold") == \ {'modality': 'func', 'modality_label': 'bold'} + # pdpn("bids_func_ses+_task-boo_run+") == \ + # order should not matter assert pdpn("bids_func_ses+_run+_task-boo") == \ { - 'modality': 'func', 'modality_label': 'bold', + 'modality': 'func', + # 'modality_label': 'bold', 'session': '+', 'run': '+', 'task': 'boo', } - assert pdpn("bids_func-pace_ses-1_run-2_task-boo_bids-please__therest") == \ + # TODO: fix for that + assert pdpn("bids_func-pace_ses-1_task-boo_acq-bu_bids-please_run-2__therest") == \ + pdpn("bids_func-pace_ses-1_run-2_task-boo_acq-bu_bids-please__therest") == \ + pdpn("func-pace_ses-1_task-boo_acq-bu_bids-please_run-2") == \ { 'modality': 'func', 'modality_label': 'pace', 'session': '1', 'run': '2', 'task': 'boo', + 'acq': 'bu', 'bids': 'bids-please' } From 29ce45f6b854d22e789386a0ffffcd4f56fef843 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 13 Oct 2016 13:05:43 -0400 Subject: [PATCH 008/181] ENH: deal with multiple files generated for fieldmap by passing seqinfo (BK) seqinfo not necessarily matches info in terms of # of items apparently... 
so will not work as is --- bin/heudiconv | 215 +++++++++++++++++++++++++++------------- heuristics/dbic_bids.py | 30 ++++-- 2 files changed, 166 insertions(+), 79 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 1f5ab83a..898c72e1 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -29,6 +29,8 @@ from collections import namedtuple from collections import defaultdict from collections import OrderedDict as ordereddict from os.path import isdir +from os.path import basename +from os.path import dirname PY3 = sys.version_info[0] >= 3 @@ -244,7 +246,7 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): #continue try: - series_id = (mw.dcm_data.SeriesNumber, + series_id = (int(mw.dcm_data.SeriesNumber), mw.dcm_data.ProtocolName) studyUID_ = mw.dcm_data.StudyInstanceUID @@ -460,11 +462,13 @@ def embed_nifti(dcmfiles, niftifile, infofile, bids_info=None, force=False): if len(stack) > 1: raise ValueError('Found multiple series') stack = stack[0] + #Create the nifti image using the data array if not os.path.exists(niftifile): nifti_image = stack.to_nifti(embed_meta=True) nifti_image.to_filename(niftifile) return ds.NiftiWrapper(nifti_image).meta_ext.to_json() + orig_nii = nb.load(niftifile) aff = orig_nii.get_affine() ornt = nb.orientations.io_orientation(aff) @@ -509,28 +513,36 @@ def safe_copyfile(src, dest): % (src, dest)) shutil.copyfile(src, dest) -def convert(items, symlink=True, converter=None, + +def convert(items, seqinfos=None, symlink=True, converter=None, scaninfo_suffix='.json', custom_callable=None, with_prov=False, is_bids=False, sourcedir=None, outdir=None): prov_files = [] tmpdir = mkdtemp(prefix='heudiconvtmp') - for item in items: + assert(not seqinfos or len(items) == len(seqinfos)) + for item_idx, item in enumerate(items): + seqinfo = seqinfos[item_idx] if seqinfos else None if isinstance(item[1], (list, tuple)): outtypes = item[1] else: outtypes = [item[1]] prefix = item[0] - dirname = os.path.dirname(prefix + '.ext') + prefix_dirname = os.path.dirname(prefix + '.ext') + prov_file = None outname_bids = prefix + '.json' + outname_bids_files = [] # actual bids files since dcm2niix might generate multiple ATM lgr.info('Converting %s -> %s . 
Converter: %s', - prefix, dirname, converter) - if not os.path.exists(dirname): - os.makedirs(dirname) + prefix, prefix_dirname, converter) + if not os.path.exists(prefix_dirname): + os.makedirs(prefix_dirname) for outtype in outtypes: item_dicoms = item[2] lgr.info("Processing %d dicoms for output type %s", len(item_dicoms), outtype) lgr.log(1, " those dicoms are: %s", item_dicoms) + + modality = basename(dirname(prefix)) if is_bids else None + if outtype == 'dicom': if is_bids: # mimic the same hierarchy location as the prefix @@ -542,8 +554,13 @@ def convert(items, symlink=True, converter=None, os.path.relpath(prefix, outdir))) if not os.path.exists(sourcedir_): os.makedirs(sourcedir_) + if modality == 'fmap': + # might have multiple - M and P, so lets suffix accordingly + suffix = {'M': '_magnitude', 'P': '_phase'}[seqinfo.image_type[2]] + else: + suffix = '' compress_dicoms(item_dicoms, - os.path.basename(prefix), + os.path.basename(prefix) + suffix, sourcedir_) else: dicomdir = prefix + '_dicom' @@ -591,19 +608,9 @@ def convert(items, symlink=True, converter=None, if converter == 'dcm2nii': convertnode.inputs.gzip_output = outtype == 'nii.gz' else: - convertnode.inputs.out_filename = os.path.basename(dirname) + convertnode.inputs.out_filename = os.path.basename(prefix_dirname) convertnode.inputs.terminal_output = 'allatonce' res = convertnode.run() - if isinstance(res.outputs.converted_files, list): - lgr.warning( - "Following series files likely have multiple orientations: %s", - item_dicoms - ) - for idx, fl in enumerate(res.outputs.converted_files): - outname = prefix + '-' + str(idx) + '.' + outtype - safe_copyfile(fl, outname) - else: - safe_copyfile(res.outputs.converted_files, outname) if isdefined(res.outputs.bvecs): outname_bvecs = prefix + '.bvec' @@ -611,17 +618,72 @@ def convert(items, symlink=True, converter=None, safe_copyfile(res.outputs.bvecs, outname_bvecs) safe_copyfile(res.outputs.bvals, outname_bvals) - if converter == 'dcm2niix' \ - and isdefined(res.outputs.bids): - ### extract bids - try: - safe_copyfile(res.outputs.bids, outname_bids) - except TypeError as exc: ##catch lists + res_files = res.outputs.converted_files + if isinstance(res_files, list): + # TODO: move into a function + # by default just suffix them up + suffixes = None + # we should provide specific handling for fmap, + # dwi etc which might spit out multiple files + if is_bids: + if modality == 'fmap': + # possible filenames and their mapping to suffixes: + fmap_filenames = { + 'fmap': '_magnitude1', + '_e2fmap': '_magnitude2', + '_e2_ph': '_phasediff', + } + suffixes = [] + for f in res_files: + fbasename = basename(f)[:-(len(outtype) + 1)] + try: + suffixes.append(fmap_filenames[fbasename]) + except KeyError: + raise ValueError("Do not know how to deal with %s output file for fmap" % fbasename) + + if not suffixes: lgr.warning( - "There was someone catching lists!: %s", - exc + "Following series files likely have " + "multiple (%d) volumes (orientations?) 
" + "generated: %s ...", + len(res_files), item_dicoms[0] ) - continue + suffixes = ['-%d' % i for i in range(len(res_files))] + + # Also copy BIDS files although they might need to be merged/postprocessed later + if converter == 'dcm2niix' and isdefined( res.outputs.bids): + assert(len(res.outputs.bids) == len(res_files)) + bids_files = res.outputs.bids + else: + bids_files = [None] * len(res_files) + + for fl, suffix, bids_file in zip(res_files, suffixes, bids_files): + outname = "%s%s.%s" % (prefix, suffix, outtype) + safe_copyfile(fl, outname) + if bids_file: + outname_bids_file = "%s%s.json" % (prefix, suffix) + safe_copyfile(bids_file, outname_bids_file) + outname_bids_files.append(outname_bids_file) + + else: + safe_copyfile(res_files, outname) + if converter == 'dcm2niix' and isdefined(res.outputs.bids): + try: + safe_copyfile(res.outputs.bids, outname_bids) + outname_bids_files.append(outname_bids) + except TypeError as exc: ##catch lists + lgr.warning( + "There was someone catching lists!: %s", exc + ) + continue + + # TODO: move into a function + # we should provide specific handling for fmap, + # dwi etc .json of which should get merged to satisfy + # BIDS. BUT wer might be somewhat not in time for a + # party here since we sorted into multiple seqinfo + # (e.g. magnitude, phase for fmap so we might want + # to sort them into a single one) if with_prov: prov_file = prefix + '_prov.ttl' @@ -630,46 +692,60 @@ def convert(items, symlink=True, converter=None, 'provenance.ttl'), prov_file) prov_files.append(prov_file) - - #if not is_bids or converter != 'dcm2niix': ##uses dcm2niix's infofile - from nipype import Node, Function - embedfunc = Node(Function(input_names=['dcmfiles', - 'niftifile', - 'infofile', - 'bids_info', - 'force'], - output_names=['outfile', - 'meta'], - function=embed_nifti), - name='embedder') - embedfunc.inputs.dcmfiles = item_dicoms - embedfunc.inputs.niftifile = os.path.abspath(outname) - embedfunc.inputs.infofile = os.path.abspath(scaninfo) - if is_bids and (converter == 'dcm2niix'): - embedfunc.inputs.bids_info = load_json(os.path.abspath(outname_bids)) - else: - embedfunc.inputs.bids_info = None - embedfunc.inputs.force = True - embedfunc.base_dir = tmpdir - cwd = os.getcwd() - try: - res = embedfunc.run() - os.chmod(scaninfo, 0o0440) - if with_prov: - g = res.provenance.rdf() - g.parse(prov_file, - format='turtle') - g.serialize(prov_file, format='turtle') - os.chmod(prov_file, 0o0440) - except: - os.chdir(cwd) - os.chmod(outname, 0o0440) + + if len(outname_bids_files) > 1: + lgr.warning( + "For now not embedding BIDS and info generated .nii.gz itself since sequence produced multiple files") + continue + + #if not is_bids or converter != 'dcm2niix': ##uses dcm2niix's infofile + + embed_metadata_into_nifti(converter, is_bids, item_dicoms, + outname, outname_bids, prov_file, + scaninfo, tmpdir, with_prov) if not custom_callable is None: custom_callable(*item) shutil.rmtree(tmpdir) +def embed_metadata_into_nifti(converter, is_bids, item_dicoms, outname, + outname_bids, prov_file, scaninfo, tmpdir, + with_prov): + from nipype import Node, Function + embedfunc = Node(Function(input_names=['dcmfiles', + 'niftifile', + 'infofile', + 'bids_info', + 'force'], + output_names=['outfile', + 'meta'], + function=embed_nifti), + name='embedder') + embedfunc.inputs.dcmfiles = item_dicoms + embedfunc.inputs.niftifile = os.path.abspath(outname) + embedfunc.inputs.infofile = os.path.abspath(scaninfo) + if is_bids and (converter == 'dcm2niix'): + embedfunc.inputs.bids_info = 
load_json(os.path.abspath(outname_bids)) + else: + embedfunc.inputs.bids_info = None + embedfunc.inputs.force = True + embedfunc.base_dir = tmpdir + cwd = os.getcwd() + try: + res = embedfunc.run() + os.chmod(scaninfo, 0o0440) + if with_prov: + g = res.provenance.rdf() + g.parse(prov_file, + format='turtle') + g.serialize(prov_file, format='turtle') + os.chmod(prov_file, 0o0440) + except: + os.chdir(cwd) + os.chmod(outname, 0o0440) + + def convert_dicoms(sid, dicoms, outdir, @@ -769,22 +845,19 @@ def convert_dicoms(sid, else: # TODO -- might have been done outside already! if dicoms: - seqinfo_dict = group_dicoms_into_seqinfos( + seqinfo = group_dicoms_into_seqinfos( dicoms, dcmfilter=getattr(heuristic, 'filter_dicom', None)) - else: - # TODO: rename to avoid ambiguity etc - seqinfo_dict = seqinfo - seqinfo = list(seqinfo_dict.keys()) - filegroup = {si.series_id: x for si, x in seqinfo_dict.items()} + seqinfo_list = list(seqinfo.keys()) + filegroup = {si.series_id: x for si, x in seqinfo.items()} save_json(filegroup_file, filegroup) dicominfo_file = os.path.join(idir, 'dicominfo%s.tsv' % ses_suffix) with open(dicominfo_file, 'wt') as fp: - for seq in seqinfo: + for seq in seqinfo_list: fp.write('\t'.join([str(val) for val in seq]) + '\n') lgr.debug("Calling out to %s.infodict", heuristic) - info = heuristic.infotodict(seqinfo) + info = heuristic.infotodict(seqinfo_list) write_config(info_file, info) write_config(edit_file, info) @@ -806,6 +879,7 @@ def convert_dicoms(sid, cinfo = conversion_info(anon_sid, tdir, info, filegroup, ses=ses) convert(cinfo, + seqinfo.keys() if seqinfo else None, # so we have access to information about sequences converter=converter, scaninfo_suffix=getattr( heuristic, 'scaninfo_suffix', '.json'), @@ -940,6 +1014,7 @@ def get_study_sessions(dicom_dir_template, files_opt, heuristic, outputdir, # sort all DICOMS using heuristic # TODO: this one is not groupping by StudyUID but may be we should! 
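
(Rough illustration of the data shapes assumed above; field names and paths are made up, and the real SeqInfo namedtuple has many more fields. group_dicoms_into_seqinfos now returns an ordered mapping from each SeqInfo to its DICOM files, from which both the per-sequence list and the filegroup mapping are derived.)

    from collections import OrderedDict, namedtuple

    MiniSeqInfo = namedtuple('MiniSeqInfo', ['series_id', 'protocol_name'])

    seqinfo = OrderedDict([
        (MiniSeqInfo((2, 'func_task-rest'), 'func_task-rest'),
         ['sub01/2/001.dcm', 'sub01/2/002.dcm']),
        (MiniSeqInfo((3, 'anat-T1w'), 'anat-T1w'),
         ['sub01/3/001.dcm']),
    ])

    seqinfo_list = list(seqinfo.keys())   # what infotodict() receives
    filegroup = {si.series_id: files for si, files in seqinfo.items()}
    assert filegroup[(3, 'anat-T1w')] == ['sub01/3/001.dcm']
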
+ #import pdb; pdb.set_trace() seqinfo_dict = group_dicoms_into_seqinfos( files_, dcmfilter=getattr(heuristic, 'filter_dicom', None), diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 583c99a7..95a07bd0 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -37,7 +37,7 @@ def infotodict(seqinfo): skipped, skipped_unknown = [], [] current_run = 0 run_label = None # run- - + image_data_type = None for s in seqinfo: template = None suffix = '' @@ -45,16 +45,18 @@ def infotodict(seqinfo): # figure out type of image from s.image_info -- just for checking ATM # since we primarily rely on encoded in the protocol name information + prev_image_data_type = image_data_type + image_data_type = s.image_type[2] image_type_modality = { - 'P': 'fmap', + 'P': 'fmap', # phase 'FMRI': 'func', 'MPR': 'anat', - # 'M': 'func', -- can be for scout, anat, bold + # 'M': 'func', "magnitude" -- can be for scout, anat, bold, fmap 'DIFFUSION': 'dwi', 'MIP_SAG': 'anat', # angiography 'MIP_COR': 'anat', # angiography 'MIP_TRA': 'anat', # angiography - }.get(s.image_type[2], None) + }.get(image_data_type, None) protocol_name_tuned = s.protocol_name # Few common replacements @@ -63,8 +65,8 @@ def infotodict(seqinfo): regd = parse_dbic_protocol_name(protocol_name_tuned) - if s.image_type[2].startswith('MIP'): - regd['acq'] = regd.get('acq', '') + s.image_type[2] + if image_data_type.startswith('MIP'): + regd['acq'] = regd.get('acq', '') + image_data_type if not regd: skipped_unknown.append(s.series_id) @@ -99,9 +101,18 @@ def infotodict(seqinfo): if run is not None: # so we have an indicator for a run if run == '+': - current_run += 1 + # some sequences, e.g. fmap, would generate two (or more?) + # sequences -- e.g. one for magnitude(s) and other ones for + # phases. In those we must not increment run! 
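            # (illustrative trace, not part of the diff: with a protocol like
            #  "func_run+" every new series advances current_run as usual; for a
            #  fieldmap acquired as magnitude then phase, only the magnitude ("M")
            #  series advances it, so magnitude and phase share the same run-XX)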
+ if image_data_type == 'P': + if prev_image_data_type != 'M': + raise RuntimeError("Was expecting phase image to follow magnitude image, but previous one was %r", prev_image_data_type) + # else we do nothing special + else: # and otherwise we go to the next run + current_run += 1 elif run == '=': - pass + if not current_run: + current_run = 1 elif run.isdigit(): current_run_ = int(run) if current_run_ < current_run: @@ -143,7 +154,8 @@ def infotodict(seqinfo): # suffix += 'seq-%s' % ('+'.join(seq)) # some are ok to skip and not to whine - if "_Scout" in s.series_description: + if "_Scout" in s.series_description or \ + (modality == 'anat' and modality_label == 'scout'): skipped.append(s.series_id) lgr.debug("Ignoring %s", s.series_id) else: From e55166b19a1db4805953220a581aeb44d49dcd2b Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 13 Oct 2016 19:32:04 -0400 Subject: [PATCH 009/181] Somewhat working version laying files out for a sample study which includes fieldmaps also creates template files for dataset descriptor etc --- bin/heudiconv | 160 +++++++++++++++++++++++++++++----------- heuristics/dbic_bids.py | 54 +++++++------- 2 files changed, 144 insertions(+), 70 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 898c72e1..41f5bf4a 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -31,6 +31,7 @@ from collections import OrderedDict as ordereddict from os.path import isdir from os.path import basename from os.path import dirname +from os.path import exists PY3 = sys.version_info[0] >= 3 @@ -514,14 +515,12 @@ def safe_copyfile(src, dest): shutil.copyfile(src, dest) -def convert(items, seqinfos=None, symlink=True, converter=None, +def convert(items, symlink=True, converter=None, scaninfo_suffix='.json', custom_callable=None, with_prov=False, is_bids=False, sourcedir=None, outdir=None): prov_files = [] tmpdir = mkdtemp(prefix='heudiconvtmp') - assert(not seqinfos or len(items) == len(seqinfos)) for item_idx, item in enumerate(items): - seqinfo = seqinfos[item_idx] if seqinfos else None if isinstance(item[1], (list, tuple)): outtypes = item[1] else: @@ -554,13 +553,8 @@ def convert(items, seqinfos=None, symlink=True, converter=None, os.path.relpath(prefix, outdir))) if not os.path.exists(sourcedir_): os.makedirs(sourcedir_) - if modality == 'fmap': - # might have multiple - M and P, so lets suffix accordingly - suffix = {'M': '_magnitude', 'P': '_phase'}[seqinfo.image_type[2]] - else: - suffix = '' compress_dicoms(item_dicoms, - os.path.basename(prefix) + suffix, + os.path.basename(prefix), sourcedir_) else: dicomdir = prefix + '_dicom' @@ -627,20 +621,8 @@ def convert(items, seqinfos=None, symlink=True, converter=None, # dwi etc which might spit out multiple files if is_bids: if modality == 'fmap': - # possible filenames and their mapping to suffixes: - fmap_filenames = { - 'fmap': '_magnitude1', - '_e2fmap': '_magnitude2', - '_e2_ph': '_phasediff', - } - suffixes = [] - for f in res_files: - fbasename = basename(f)[:-(len(outtype) + 1)] - try: - suffixes.append(fmap_filenames[fbasename]) - except KeyError: - raise ValueError("Do not know how to deal with %s output file for fmap" % fbasename) - + # expected! 
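                        # (illustration: a double-echo magnitude series typically comes
                        #  out of dcm2niix as two volumes, which here simply get numeric
                        #  suffixes -- with the heuristic's "magnitude" label that yields
                        #  _magnitude1/_magnitude2, while the single-volume phasediff
                        #  series keeps its plain name)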
+ suffixes = ["%d" % (i+1) for i in range(len(res_files))] if not suffixes: lgr.warning( "Following series files likely have " @@ -648,10 +630,10 @@ def convert(items, seqinfos=None, symlink=True, converter=None, "generated: %s ...", len(res_files), item_dicoms[0] ) - suffixes = ['-%d' % i for i in range(len(res_files))] + suffixes = ['-%d' % (i+1) for i in range(len(res_files))] # Also copy BIDS files although they might need to be merged/postprocessed later - if converter == 'dcm2niix' and isdefined( res.outputs.bids): + if converter == 'dcm2niix' and isdefined(res.outputs.bids): assert(len(res.outputs.bids) == len(res_files)) bids_files = res.outputs.bids else: @@ -677,13 +659,14 @@ def convert(items, seqinfos=None, symlink=True, converter=None, ) continue - # TODO: move into a function - # we should provide specific handling for fmap, - # dwi etc .json of which should get merged to satisfy - # BIDS. BUT wer might be somewhat not in time for a - # party here since we sorted into multiple seqinfo - # (e.g. magnitude, phase for fmap so we might want - # to sort them into a single one) + # Fix up and unify BIDS files + tuneup_bids_json_files(outname_bids_files) + # we should provide specific handling for fmap, + # dwi etc .json of which should get merged to satisfy + # BIDS. BUT wer might be somewhat not in time for a + # party here since we sorted into multiple seqinfo + # (e.g. magnitude, phase for fmap so we might want + # to sort them into a single one) if with_prov: prov_file = prefix + '_prov.ttl' @@ -696,19 +679,58 @@ def convert(items, seqinfos=None, symlink=True, converter=None, if len(outname_bids_files) > 1: lgr.warning( "For now not embedding BIDS and info generated .nii.gz itself since sequence produced multiple files") - continue - - #if not is_bids or converter != 'dcm2niix': ##uses dcm2niix's infofile + else: + #if not is_bids or converter != 'dcm2niix': ##uses dcm2niix's infofile + embed_metadata_into_nifti(converter, is_bids, item_dicoms, + outname, outname_bids, prov_file, + scaninfo, tmpdir, with_prov) + os.chmod(outname, 0o0440) - embed_metadata_into_nifti(converter, is_bids, item_dicoms, - outname, outname_bids, prov_file, - scaninfo, tmpdir, with_prov) - if not custom_callable is None: + if custom_callable is not None: custom_callable(*item) shutil.rmtree(tmpdir) +def tuneup_bids_json_files(json_files): + """Given a list of BIDS .json files, e.g. """ + if not json_files: + return + + # Harmonize generic .json formatting + for jsonfile in json_files: + json_ = json.load(open(jsonfile)) + json.dump(json_, open(jsonfile, 'w'), indent=2) + + # Load the beast + modality = basename(dirname(jsonfile)) + + if modality == 'fmap': + json_basename = '_'.join(jsonfile.split('_')[:-1]) + # if we got by now all needed .json files -- we can fix them up + # unfortunately order of "items" is not guaranteed atm + if len(glob(json_basename + '*.json')) == 3: + json_phasediffname = json_basename + '_phasediff.json' + json_ = json.load(open(json_phasediffname)) + # TODO: we might want to reorder them since ATM + # the one for shorter TE is the 2nd one! 
+ # For now just save truthfully by loading magnitude files + lgr.info("Placing EchoTime fields into phasediff file") + for i in 1, 2: + json_['EchoTime%d' % i] = \ + json.load(open(json_basename + '_magnitude%d.json' % i))[ + 'EchoTime'] + # might have been made R/O already + os.chmod(json_phasediffname, 0o0660) + json.dump(json_, open(json_phasediffname, 'w'), indent=2) + os.chmod(json_phasediffname, 0o0440) + + # phasediff one should contain two PhaseDiff's + # -- one for original amplitude and the other already replicating what is there + # so let's load json files for magnitudes and + # place them into phasediff + + def embed_metadata_into_nifti(converter, is_bids, item_dicoms, outname, outname_bids, prov_file, scaninfo, tmpdir, with_prov): @@ -743,7 +765,7 @@ def embed_metadata_into_nifti(converter, is_bids, item_dicoms, outname, os.chmod(prov_file, 0o0440) except: os.chdir(cwd) - os.chmod(outname, 0o0440) + def convert_dicoms(sid, @@ -819,6 +841,8 @@ def convert_dicoms(sid, idir = os.path.join(outdir, '.heudiconv', sid) if is_bids and ses: idir = os.path.join(idir, 'ses-%s' % str(ses)) + # yoh: in my case if idir exists, it means that that study/subject/session + # is already processed if anon_outdir == outdir: # if all goes into a single dir, have a dedicated 'info' subdir idir = os.path.join(idir, 'info') @@ -879,7 +903,6 @@ def convert_dicoms(sid, cinfo = conversion_info(anon_sid, tdir, info, filegroup, ses=ses) convert(cinfo, - seqinfo.keys() if seqinfo else None, # so we have access to information about sequences converter=converter, scaninfo_suffix=getattr( heuristic, 'scaninfo_suffix', '.json'), @@ -1059,6 +1082,42 @@ def is_interactive(): return sys.stdin.isatty() and sys.stdout.isatty() and sys.stderr.isatty() +def populate_bids_templates(path): + # dataset descriptor + descriptor = opj(path, 'dataset_description.json') + if not exists(descriptor): + save_json(descriptor, + ordereddict([ + ('Name', "TODO: name of the dataset"), + ('BIDSVersion', "1.0.1"), + ('License', "TODO: choose a license, e.g. PDDL (http://opendatacommons.org/licenses/pddl/)"), + ('Authors', ["TODO:", "First1 Last1", "First2 Last2", "..."]), + ('Acknowledgements', "TODO: who should be acknowledge in helping to collect the data"), + ('HowToAcknowledge', "TODO: describe how to acknowledge -- either cite a corresponding paper, or just in acknowledgement section"), + ('Funding', ["TODO", "GRANT #1", "GRANT #2"]), + ('ReferencesAndLinks', ["TODO", "List of papers or websites"]), + ('DatasetDOI', 'TODO: eventually a DOI for the dataset') + ])) + + sourcedata_README = opj(path, 'sourcedata', 'README') + if exists(dirname(sourcedata_README)) and not exists(sourcedata_README): + with open(sourcedata_README, 'w') as f: + f.write("""\ +TODO: Provide description about source data, e.g. + +Directory below contains DICOMS compressed into tarballs per each sequence, +replicating directory hierarchy of the BIDS dataset itself. +""") + + README = opj(path, 'README') + if not exists(README): + with open(README, 'w') as f: + f.write("""\ +TODO: Provide description for the dataset -- basic details about the study, +possibly pointing to pre-registration (if public or embargoed) +""") + + _sys_excepthook = sys.excepthook # Just in case we ever need original one @@ -1189,6 +1248,9 @@ s3 heuristic, outputdir, session, subjs)# extract tarballs, and replace their entries with expanded lists of files # TODO: we might need to sort so sessions are ordered??? 
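
(To make the fieldmap .json tune-up above concrete: a self-contained sketch, with hypothetical file names and EchoTime values, of which fields end up in the phasediff sidecar once all three files are present.)

    import json
    import os
    import tempfile

    fmap_dir = tempfile.mkdtemp()
    base = os.path.join(fmap_dir, 'sub-01')            # hypothetical prefix
    for suffix, meta in [('_magnitude1', {'EchoTime': 0.00492}),
                         ('_magnitude2', {'EchoTime': 0.00738}),
                         ('_phasediff', {})]:
        with open(base + suffix + '.json', 'w') as f:
            json.dump(meta, f, indent=2)

    # what tuneup_bids_json_files effectively does for the phasediff file
    phasediff = base + '_phasediff.json'
    js = json.load(open(phasediff))
    for i in (1, 2):
        js['EchoTime%d' % i] = json.load(
            open(base + '_magnitude%d.json' % i))['EchoTime']
    json.dump(js, open(phasediff, 'w'), indent=2)

    assert json.load(open(phasediff))['EchoTime1'] == 0.00492
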
lgr.info("Need to process %d study sessions", len(study_sessions)) + + processed_studydirs = set() + for (locator, session, subject), files_or_seqinfo in study_sessions.items(): if not len(files_or_seqinfo): @@ -1203,10 +1265,15 @@ s3 files = files_or_seqinfo seqinfo = None + study_outputdir = opj(outputdir, locator or '') + + # TODO: --datalad cmdline option, which would take care about initiating + # the outputdir -> study_outputdir datasets if not yet there + convert_dicoms( subject, files, - opj(outputdir, locator or ''), + study_outputdir, heuristic=heuristic, converter=args.converter, queue=args.queue, @@ -1216,6 +1283,15 @@ s3 ses=session, is_bids=args.bids, seqinfo=seqinfo) + processed_studydirs.add(outputdir) + + if args.bids: + # Let's populate BIDS templates for folks to take care about + for study_outputdir in processed_studydirs: + populate_bids_templates(study_outputdir) + + # TODO: record_collection of the subject/session although that information + # is pretty much present in .heudiconv/SUBJECT/info so we could just poke there tempdirs.cleanup() diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 95a07bd0..dd6afec9 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -7,12 +7,16 @@ def create_key(subdir, file_suffix, outtype=('nii.gz', 'dicom'), - annotation_classes=None): + annotation_classes=None, prefix=''): if not subdir: raise ValueError('subdir must be a valid format string') # may be even add "performing physician" if defined?? - template = "{bids_subject_session_dir}/" \ - "%s/{bids_subject_session_prefix}_%s" % (subdir, file_suffix) + template = os.path.join( + prefix, + "{bids_subject_session_dir}", + subdir, + "{bids_subject_session_prefix}_%s" % file_suffix + ) return template, outtype, annotation_classes @@ -84,10 +88,13 @@ def infotodict(seqinfo): # Let's for now stash those close to original images # TODO: we might want a separate tree for all of this!? # so more of a parameter to the create_key - modality += '/derivative' + #modality += '/derivative' # just keep it lower case and without special characters # XXXX what for??? - seq.append(s.series_description.lower()) + #seq.append(s.series_description.lower()) + prefix = os.path.join('derivatives', 'scanner') + else: + prefix = '' # analyze s.protocol_name (series_id is based on it) for full name mapping etc if modality == 'func' and not modality_label: @@ -97,6 +104,12 @@ def infotodict(seqinfo): # assume bold by default modality_label = 'bold' + if modality == 'fmap' and not modality_label: + modality_label = { + 'M': 'magnitude', # might want explicit {file_index} ? + 'P': 'phasediff' + }[image_data_type] + run = regd.get('run') if run is not None: # so we have an indicator for a run @@ -159,7 +172,7 @@ def infotodict(seqinfo): skipped.append(s.series_id) lgr.debug("Ignoring %s", s.series_id) else: - template = create_key(modality, suffix) + template = create_key(modality, suffix, prefix=prefix) info[template].append(s.series_id) info = dict(info) # convert to dict since outside functionality depends on it being a basic dict @@ -211,16 +224,18 @@ def infotoids(seqinfos, outputdir): # we have a session or possibly more than one even # let's figure out which case we have nonsign_vals = set(ses_markers).difference('+=') + # although we might want an explicit '=' to note the same session as + # mentioned before? + if len(nonsign_vals) > 1: + lgr.warning( #raise NotImplementedError( + "Cannot deal with multiple sessions in the same study yet!" 
+ " We will process until the end of the first session" + ) if nonsign_vals: if set(ses_markers).intersection('+='): raise NotImplementedError( "Should not mix hardcoded session markers with incremental ones (+=)" ) - # although we might want an explicit '=' to note the same session as - # mentioned before? - if len(nonsign_vals) > 1: - raise NotImplementedError( - "Cannot deal with multiple sessions in the same study yet!") assert len(ses_markers) == 1 session = ses_markers[0] else: @@ -247,23 +262,6 @@ def parse_dbic_protocol_name(protocol_name): # Parse the name according to our convention # https://docs.google.com/document/d/1R54cgOe481oygYVZxI7NHrifDyFUZAjOBwCTu7M7y48/edit?usp=sharing - import re - - # TODO -- redo without mandating order of e.g. _run vs _task to go first, - # since BIDS somewhat imposes the order but it doesn't matter. So better be - # flexible -- split first on __ and then on _ within the first field and analyze - # bids_regex = re.compile( - # r""" - # (?P[^-_]+)(-(?P[^-_]+))? # modality - # (_ses(?P([+=]|-[^-_]+)))? # session - # (_run(?P([+=]|-[^-_]+)))? # run - # (_task-(?P[^-_]+))? # task - # (?P(_[^_]+)+)? # more of _ separated items for generic BIDS - # (__.*?)? # some custom suffix which will not be included anywhere - # """, - # flags=re.X - # ) - # Remove possible suffix we don't care about after __ protocol_name = protocol_name.split('__', 1)[0] From dba4e07d287af568871af36458cd7f6920ecc733 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 13 Oct 2016 20:53:32 -0400 Subject: [PATCH 010/181] ENH: generate participant file etc --- bin/heudiconv | 111 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 100 insertions(+), 11 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 41f5bf4a..8609c0c5 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -66,6 +66,8 @@ SeqInfo = namedtuple( 'series_description', 'image_type', 'accession_number', + 'patient_age', + 'patient_sex', ] ) @@ -366,14 +368,15 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): dcminfo.SeriesDescription, tuple(dcminfo.ImageType), dcminfo.AccessionNumber, + # For demographics to populate BIDS participants.tsv + dcminfo.PatientsAge, + dcminfo.PatientsSex, ) # candidates # dcminfo.AccessionNumber # len(dcminfo.ReferencedImageSequence) # len(dcminfo.SourceImageSequence) # FOR demographics - # dcminfo.PatientsAge - # dcminfo.PatientsSex lgr.debug("%30s %27s %27s %5s nref=%-2d nsrc=%-2d %s" % ( info.series_id, dcminfo.SeriesDescription, @@ -1082,6 +1085,15 @@ def is_interactive(): return sys.stdin.isatty() and sys.stdout.isatty() and sys.stderr.isatty() +def create_file_if_missing(filename, content): + """Create file if missing, so we do not override any possibly introduced changes""" + if exists(filename): + return False + with open(filename, 'w') as f: + f.write(content) + return True + + def populate_bids_templates(path): # dataset descriptor descriptor = opj(path, 'dataset_description.json') @@ -1092,7 +1104,7 @@ def populate_bids_templates(path): ('BIDSVersion', "1.0.1"), ('License', "TODO: choose a license, e.g. PDDL (http://opendatacommons.org/licenses/pddl/)"), ('Authors', ["TODO:", "First1 Last1", "First2 Last2", "..."]), - ('Acknowledgements', "TODO: who should be acknowledge in helping to collect the data"), + ('Acknowledgements', "We thank Terry Sacket and the rest of the DBIC (Dartmouth Brain Imaging Center) personnel for assistance in data collection. 
TODO: more"), ('HowToAcknowledge', "TODO: describe how to acknowledge -- either cite a corresponding paper, or just in acknowledgement section"), ('Funding', ["TODO", "GRANT #1", "GRANT #2"]), ('ReferencesAndLinks', ["TODO", "List of papers or websites"]), @@ -1100,24 +1112,94 @@ def populate_bids_templates(path): ])) sourcedata_README = opj(path, 'sourcedata', 'README') - if exists(dirname(sourcedata_README)) and not exists(sourcedata_README): - with open(sourcedata_README, 'w') as f: - f.write("""\ + if exists(dirname(sourcedata_README)): + create_file_if_missing( + sourcedata_README, + """\ TODO: Provide description about source data, e.g. Directory below contains DICOMS compressed into tarballs per each sequence, replicating directory hierarchy of the BIDS dataset itself. """) - README = opj(path, 'README') - if not exists(README): - with open(README, 'w') as f: - f.write("""\ + create_file_if_missing( + opj(path, 'CHANGES'), + """\ +0.0.1 Initial data acquired + +TODOs: + - verify and possibly extend information in participants.tsv + (see for example http://datasets.datalad.org/?dir=/openfmri/ds000208) + - fill out dataset_description.json, README, sourcedata/README (if present) + - provide _events.tsv file for each _bold.nii.gz with onsets of events + (see "8.5 Task events" of BIDS specification) +""") + + create_file_if_missing( + opj(path, 'README'), + """\ TODO: Provide description for the dataset -- basic details about the study, possibly pointing to pre-registration (if public or embargoed) """) +def add_participant_record(studydir, subject, age, sex): + participants_tsv = opj(studydir, 'participants.tsv') + participant_id = 'sub-%s' % subject + + if not create_file_if_missing( + participants_tsv, + '\t'.join(['participant_id', 'age', 'sex', 'group']) + '\n' + ): + # check if may be subject record already exists + with open(participants_tsv) as f: + f.readline() + known_subjects = {l.split('\t')[0] for l in f.readline()} + if participant_id in known_subjects: + # already there -- not adding + return + + # Add a new participant + with open(participants_tsv, 'a') as f: + f.write('\t'.join(map(str, [ + participant_id, + age.lstrip('0').rstrip('Y'), + sex, + 'control'])) + + '\n') + + +def prepare_for_datalad(topdir, studydir): + """Do all necessary preparations (if were not done before) and save + """ + from datalad.api import create + from datalad.api import Dataset + + studyrelpath = os.path.relpath(studydir, topdir) + assert not studyrelpath.startswith(os.path.pardir) # so we are under + # now we need to test and initiate a DataLad dataset all along the path + curdir = topdir + for subdir in [''] + studyrelpath.split(os.path.sep): + curdir = opj(curdir, subdir) + ds = Dataset(curdir) + if not ds.is_installed(): + lgr.info("Initiating %s", ds) + ds_ = create(curdir, force=True) + assert ds == ds_ + assert ds.is_installed() + + create_file_if_missing( + opj(studydir, '.gitattributes'), + """\ +*annex.largefiles = (largerthan = 100kb) +*.json annex.largefiles = nothing +*.txt annex.largefiles = nothing +*.tsv annex.largefiles = nothing +""") + + ds.save + + _sys_excepthook = sys.excepthook # Just in case we ever need original one @@ -1283,7 +1365,14 @@ s3 ses=session, is_bids=args.bids, seqinfo=seqinfo) - processed_studydirs.add(outputdir) + + if args.bids and seqinfo: + add_participant_record(study_outputdir, + subject, + seqinfo.keys()[0].patient_age, + seqinfo.keys()[0].patient_sex, + ) + processed_studydirs.add(study_outputdir) if args.bids: # Let's populate BIDS templates for 
folks to take care about From f886e9281fe4f2fefd7df7d0d7b169809dc703c1 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 13 Oct 2016 20:57:22 -0400 Subject: [PATCH 011/181] ENH: let's try to use pytest --- .coveragerc | 4 ++++ .travis.yml | 3 ++- pytest.ini | 2 ++ tests/test_main.py | 13 +++++++------ 4 files changed, 15 insertions(+), 7 deletions(-) create mode 100644 .coveragerc create mode 100644 pytest.ini diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..32298e55 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,4 @@ +[run] +include = tests/* + bin/* + setup.py diff --git a/.travis.yml b/.travis.yml index 648d768c..56d897a8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -27,7 +27,8 @@ before_install: script: - - nosetests -s -v --with-doctest --doctest-tests --with-cov --cover-package . --logging-level=INFO tests + # - nosetests -s -v --with-doctest --doctest-tests --with-cov --cover-package . --logging-level=INFO tests + - coverage run `which py.test` -s -v tests after_success: - codecov diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..df3eb518 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = --doctest-modules diff --git a/tests/test_main.py b/tests/test_main.py index 1a496b2e..1808e9b1 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -2,18 +2,19 @@ from mock import patch from six.moves import StringIO -from nose.tools import assert_raises, assert_equal - +import pytest from . import heudiconv @patch('sys.stdout', new_callable=StringIO) def test_main_help(stdout): - assert_raises(SystemExit, heudiconv.main, ['--help']) - assert(stdout.getvalue().startswith("usage: ")) + with pytest.raises(SystemExit): + heudiconv.main(['--help']) + assert stdout.getvalue().startswith("usage: ") @patch('sys.stderr' if sys.version_info[:2] <= (3, 3) else 'sys.stdout', new_callable=StringIO) def test_main_version(std): - assert_raises(SystemExit, heudiconv.main, ['--version']) - assert_equal(std.getvalue().rstrip(), heudiconv.__version__) + with pytest.raises(SystemExit): + heudiconv.main(['--version']) + assert std.getvalue().rstrip() == heudiconv.__version__ From da8e8b5b1b57b8fb53c18cb6b2cc5c87502c3339 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 13 Oct 2016 21:16:57 -0400 Subject: [PATCH 012/181] ENH+BF: few basic tests and a little bugfix already ;-) --- bin/heudiconv | 2 +- tests/test_main.py | 43 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 8609c0c5..2ae84f18 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1154,7 +1154,7 @@ def add_participant_record(studydir, subject, age, sex): # check if may be subject record already exists with open(participants_tsv) as f: f.readline() - known_subjects = {l.split('\t')[0] for l in f.readline()} + known_subjects = {l.split('\t')[0] for l in f.readlines()} if participant_id in known_subjects: # already there -- not adding return diff --git a/tests/test_main.py b/tests/test_main.py index 1808e9b1..62d635e1 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,8 +1,11 @@ +import pytest import sys from mock import patch +from os.path import exists from six.moves import StringIO -import pytest + + from . 
import heudiconv @@ -18,3 +21,41 @@ def test_main_version(std): with pytest.raises(SystemExit): heudiconv.main(['--version']) assert std.getvalue().rstrip() == heudiconv.__version__ + + +def test_create_file_if_missing(tmpdir): + tf = tmpdir.join("README.txt") + assert not tf.exists() + heudiconv.create_file_if_missing(str(tf), "content") + assert tf.exists() + assert tf.read() == "content" + heudiconv.create_file_if_missing(str(tf), "content2") + # nothing gets changed + assert tf.read() == "content" + + +def test_populate_bids_templates(tmpdir): + heudiconv.populate_bids_templates(str(tmpdir)) + for f in "README", "dataset_description.json", "CHANGES": + # Just test that we have created them and they all have stuff TODO + assert "TODO" in tmpdir.join(f).read() + + +def test_add_participant_record(tmpdir): + tf = tmpdir.join('participants.tsv') + assert not tf.exists() + heudiconv.add_participant_record(str(tmpdir), "sub01", "023Y", "M") + # should create the file and place corrected record + sub01 = tf.read() + assert sub01 == """\ +participant_id age sex group +sub-sub01 23 M control +""" + heudiconv.add_participant_record(str(tmpdir), "sub01", "023Y", "F") + assert tf.read() == sub01 # nothing was added even though differs in values + heudiconv.add_participant_record(str(tmpdir), "sub02", "2", "F") + assert tf.read() == """\ +participant_id age sex group +sub-sub01 23 M control +sub-sub02 2 F control +""" From b16081e52b2a6caf53a39aec9577bfee708435e5 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 13 Oct 2016 21:46:31 -0400 Subject: [PATCH 013/181] TST+BF: testing initialization of datalad dataset there --- .travis.yml | 5 ++++- bin/heudiconv | 17 ++++++++++------- tests/test_main.py | 33 ++++++++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index 56d897a8..53b53cd8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,7 +24,10 @@ before_install: - echo '' > requirements.txt - pip install -r dev-requirements.txt - pip install codecov - + # The ultimate one-liner setup for NeuroDebian repository + - bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh) + - travis_retry sudo apt-get update -qq + - travis_retry sudo apt-get install git-annex-standalone python-datalad script: # - nosetests -s -v --with-doctest --doctest-tests --with-cov --cover-package . --logging-level=INFO tests diff --git a/bin/heudiconv b/bin/heudiconv index 2ae84f18..fee89d5d 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1184,20 +1184,23 @@ def prepare_for_datalad(topdir, studydir): ds = Dataset(curdir) if not ds.is_installed(): lgr.info("Initiating %s", ds) - ds_ = create(curdir, force=True) + ds_ = create(curdir, force=True, annex_version=6) assert ds == ds_ assert ds.is_installed() create_file_if_missing( opj(studydir, '.gitattributes'), """\ -*annex.largefiles = (largerthan = 100kb) -*.json annex.largefiles = nothing -*.txt annex.largefiles = nothing -*.tsv annex.largefiles = nothing +* annex.largefiles=(largerthan=100kb) +*.json annex.largefiles=nothing +*.txt annex.largefiles=nothing +*.tsv annex.largefiles=nothing """) - - ds.save + # so for mortals it just looks like a regular directory! 
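    # (note: with a v6 repository and annex.thin=true, unlocked annexed files are
    #  kept as regular files hardlinked into the annex store rather than as the
    #  usual symlinks, which is what makes the tree look like a plain directory)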
+ ds.config.add('annex.thin', 'true', where='local') + # Let's make it a + dstop = Dataset(topdir) + dstop.save(auto_add_changes=True, recursive=True) _sys_excepthook = sys.excepthook # Just in case we ever need original one diff --git a/tests/test_main.py b/tests/test_main.py index 62d635e1..03edcf6c 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,8 +1,9 @@ +import os import pytest import sys from mock import patch -from os.path import exists +from os.path import join as opj from six.moves import StringIO @@ -59,3 +60,33 @@ def test_add_participant_record(tmpdir): sub-sub01 23 M control sub-sub02 2 F control """ + + +def test_prepare_for_datalad(tmpdir): + pytest.importorskip("datalad") + studydir = tmpdir.join("PI").join("study") + studydir_ = str(studydir) + os.makedirs(studydir_) + heudiconv.populate_bids_templates(studydir_) + + heudiconv.prepare_for_datalad(str(tmpdir), studydir_) + + from datalad.api import Dataset + superds = Dataset(str(tmpdir)) + + assert superds.is_installed() + assert not superds.repo.dirty + subdss = superds.get_subdatasets(recursive=True) + for ds_path in sorted(subdss): + ds = Dataset(opj(superds.path, ds_path)) + assert ds.is_installed() + assert not ds.repo.dirty + + # the last one should have been the study + target_files = { + '.gitattributes', '.datalad/config', 'dataset_description.json', + 'CHANGES', 'README'} + assert set(ds.repo.get_indexed_files()) == target_files + # and all are under git + for f in target_files: + assert not ds.repo.is_under_annex(f) \ No newline at end of file From 6f17ecfc5fa930eeb238f10045319636ac9689c6 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 13 Oct 2016 22:31:13 -0400 Subject: [PATCH 014/181] ENH: notes on annex/datalad, try to install datalad via pip on elderly precise --- .travis.yml | 3 ++- bin/heudiconv | 23 ++++++++++++++++++++++- tests/test_main.py | 2 +- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 53b53cd8..da2d71d0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -27,7 +27,8 @@ before_install: # The ultimate one-liner setup for NeuroDebian repository - bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh) - travis_retry sudo apt-get update -qq - - travis_retry sudo apt-get install git-annex-standalone python-datalad + - travis_retry sudo apt-get install git-annex-standalone + - pip install datalad script: # - nosetests -s -v --with-doctest --doctest-tests --with-cov --cover-package . --logging-level=INFO tests diff --git a/bin/heudiconv b/bin/heudiconv index fee89d5d..88c026d5 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1169,7 +1169,7 @@ def add_participant_record(studydir, subject, age, sex): + '\n') -def prepare_for_datalad(topdir, studydir): +def add_to_datalad(topdir, studydir, bids=False): """Do all necessary preparations (if were not done before) and save """ from datalad.api import create @@ -1202,6 +1202,19 @@ def prepare_for_datalad(topdir, studydir): dstop = Dataset(topdir) dstop.save(auto_add_changes=True, recursive=True) + # TODO: they are still appearing as native annex symlinked beasts + """ + TODOs: + it needs + - unlock (thin will be in effect) + - save/commit (does modechange 120000 => 100644 + + - we should mark dicoms and anatomicals as distribution-restricted + - could potentially somehow automate that all: + http://git-annex.branchable.com/tips/automatically_adding_metadata/ + - possibly even make separate sub-datasets for originaldata, derivatives ? 
+ """ + _sys_excepthook = sys.excepthook # Just in case we ever need original one @@ -1284,6 +1297,12 @@ s3 help='''session for longitudinal study_sessions, default is none''') parser.add_argument('-b', '--bids', dest='bids', action='store_true', help='''flag for output into BIDS structure''') + parser.add_argument('--datalad', dest='datalad', action='store_true', + help='''Store the entire collection as DataLad dataset(s). + Small files will be committed directly to git, while large to annex. + New version (6) of annex repositories will be used in a "thin" + mode so it would look to mortals as just any other regular directory + (i.e. no symlinks to under .git/annex). For now just for BIDS mode.''') parser.add_argument('--dbg', action='store_true', dest='debug', help="do not catch exceptions and show exception traceback") @@ -1381,6 +1400,8 @@ s3 # Let's populate BIDS templates for folks to take care about for study_outputdir in processed_studydirs: populate_bids_templates(study_outputdir) + if args.datalad: + add_to_datalad(outputdir, study_outputdir, bids=args.bids) # TODO: record_collection of the subject/session although that information # is pretty much present in .heudiconv/SUBJECT/info so we could just poke there diff --git a/tests/test_main.py b/tests/test_main.py index 03edcf6c..69a264cc 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -69,7 +69,7 @@ def test_prepare_for_datalad(tmpdir): os.makedirs(studydir_) heudiconv.populate_bids_templates(studydir_) - heudiconv.prepare_for_datalad(str(tmpdir), studydir_) + heudiconv.add_to_datalad(str(tmpdir), studydir_) from datalad.api import Dataset superds = Dataset(str(tmpdir)) From 77f607d69676aca65d4894edac87ce521bb9c247 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 14 Oct 2016 07:55:57 -0400 Subject: [PATCH 015/181] ENH: setup git user on travis so git does not complain --- .travis.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index da2d71d0..dcae7c32 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,10 +16,6 @@ env: - DATALAD_TESTS_SSH=1 before_install: - # The ultimate one-liner setup for NeuroDebian repository - # which might be needed later for tools - #- bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh) - #- travis_retry sudo apt-get update -qq # for now even remove requirements.txt since dependencies aren't avail - echo '' > requirements.txt - pip install -r dev-requirements.txt @@ -30,6 +26,10 @@ before_install: - travis_retry sudo apt-get install git-annex-standalone - pip install datalad +install: + - git config --global user.email "test@travis.land" + - git config --global user.name "Travis Almighty" + script: # - nosetests -s -v --with-doctest --doctest-tests --with-cov --cover-package . 
--logging-level=INFO tests - coverage run `which py.test` -s -v tests From 2f8ed08ea166f3bbbd03f802cedc06765cff283e Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 14 Oct 2016 08:03:29 -0400 Subject: [PATCH 016/181] RF: modality -> seqtype --- bin/heudiconv | 8 +++--- heuristics/dbic_bids.py | 62 ++++++++++++++++++++--------------------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 88c026d5..80de0e0f 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -543,7 +543,7 @@ def convert(items, symlink=True, converter=None, len(item_dicoms), outtype) lgr.log(1, " those dicoms are: %s", item_dicoms) - modality = basename(dirname(prefix)) if is_bids else None + seqtype = basename(dirname(prefix)) if is_bids else None if outtype == 'dicom': if is_bids: @@ -623,7 +623,7 @@ def convert(items, symlink=True, converter=None, # we should provide specific handling for fmap, # dwi etc which might spit out multiple files if is_bids: - if modality == 'fmap': + if seqtype == 'fmap': # expected! suffixes = ["%d" % (i+1) for i in range(len(res_files))] if not suffixes: @@ -706,9 +706,9 @@ def tuneup_bids_json_files(json_files): json.dump(json_, open(jsonfile, 'w'), indent=2) # Load the beast - modality = basename(dirname(jsonfile)) + seqtype = basename(dirname(jsonfile)) - if modality == 'fmap': + if seqtype == 'fmap': json_basename = '_'.join(jsonfile.split('_')[:-1]) # if we got by now all needed .json files -- we can fix them up # unfortunately order of "items" is not guaranteed atm diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index dd6afec9..c7f0c75b 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -51,7 +51,7 @@ def infotodict(seqinfo): # since we primarily rely on encoded in the protocol name information prev_image_data_type = image_data_type image_data_type = s.image_type[2] - image_type_modality = { + image_type_seqtype = { 'P': 'fmap', # phase 'FMRI': 'func', 'MPR': 'anat', @@ -76,19 +76,19 @@ def infotodict(seqinfo): skipped_unknown.append(s.series_id) continue - modality = regd.pop('modality') - modality_label = regd.pop('modality_label', None) + seqtype = regd.pop('seqtype') + seqtype_label = regd.pop('seqtype_label', None) - if image_type_modality and modality != image_type_modality: + if image_type_seqtype and seqtype != image_type_seqtype: lgr.warning( - "Deduced modality to be %s from DICOM, but got %s out of %s", - image_type_modality, modality, protocol_name_tuned) + "Deduced seqtype to be %s from DICOM, but got %s out of %s", + image_type_seqtype, seqtype, protocol_name_tuned) if s.is_derived: # Let's for now stash those close to original images # TODO: we might want a separate tree for all of this!? # so more of a parameter to the create_key - #modality += '/derivative' + #seqtype += '/derivative' # just keep it lower case and without special characters # XXXX what for??? 
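For reference, the seqtype used in convert() and tuneup_bids_json_files() above is simply the name of the BIDS subdirectory that the output prefix points into; a tiny sketch with a made-up prefix:

    from os.path import basename, dirname

    prefix = 'sub-01/ses-01/func/sub-01_ses-01_task-rest_run-01'
    seqtype = basename(dirname(prefix))   # -> 'func'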
#seq.append(s.series_description.lower()) @@ -97,15 +97,15 @@ def infotodict(seqinfo): prefix = '' # analyze s.protocol_name (series_id is based on it) for full name mapping etc - if modality == 'func' and not modality_label: + if seqtype == 'func' and not seqtype_label: if '_pace_' in protocol_name_tuned: - modality_label = 'pace' # or should it be part of seq- + seqtype_label = 'pace' # or should it be part of seq- else: # assume bold by default - modality_label = 'bold' + seqtype_label = 'bold' - if modality == 'fmap' and not modality_label: - modality_label = { + if seqtype == 'fmap' and not seqtype_label: + seqtype_label = { 'M': 'magnitude', # might want explicit {file_index} ? 'P': 'phasediff' }[image_data_type] @@ -147,7 +147,7 @@ def infotodict(seqinfo): None if not regd.get('acq') else "acq-%s" % regd['acq'], regd.get('bids'), run_label, - modality_label, + seqtype_label, ] # filter tose which are None, and join with _ suffix = '_'.join(filter(bool, suffix_parts)) @@ -168,11 +168,11 @@ def infotodict(seqinfo): # some are ok to skip and not to whine if "_Scout" in s.series_description or \ - (modality == 'anat' and modality_label == 'scout'): + (seqtype == 'anat' and seqtype_label == 'scout'): skipped.append(s.series_id) lgr.debug("Ignoring %s", s.series_id) else: - template = create_key(modality, suffix, prefix=prefix) + template = create_key(seqtype, suffix, prefix=prefix) info[template].append(s.series_id) info = dict(info) # convert to dict since outside functionality depends on it being a basic dict @@ -282,17 +282,17 @@ def split2(s): return s, None # Let's analyze first element which should tell us sequence type - modality, modality_label = split2(split[0]) - if modality not in {'anat', 'func', 'dwi', 'behav', 'fmap'}: + seqtype, seqtype_label = split2(split[0]) + if seqtype not in {'anat', 'func', 'dwi', 'behav', 'fmap'}: # It is not something we don't consume if bids: lgr.warning("It was instructed to be BIDS sequence but unknown " - "type %s found", modality) + "type %s found", seqtype) return {} - regd = dict(modality=modality) - if modality_label: - regd['modality_label'] = modality_label + regd = dict(seqtype=seqtype) + if seqtype_label: + regd['seqtype_label'] = seqtype_label # now go through each to see if one which we care bids_leftovers = [] for s in split[1:]: @@ -312,12 +312,12 @@ def split2(s): # TODO: might want to check for all known "standard" BIDS suffixes here # among bids_leftovers, thus serve some kind of BIDS validator - # if not regd.get('modality_label', None): - # # might need to assign a default label for each modality if was not + # if not regd.get('seqtype_label', None): + # # might need to assign a default label for each seqtype if was not # # given - # regd['modality_label'] = { + # regd['seqtype_label'] = { # 'func': 'bold' - # }.get(regd['modality'], None) + # }.get(regd['seqtype'], None) return regd @@ -330,14 +330,14 @@ def test_parse_dbic_protocol_name(): assert pdpn("bids_func-bold") == \ pdpn("func-bold") == \ - {'modality': 'func', 'modality_label': 'bold'} + {'seqtype': 'func', 'seqtype_label': 'bold'} # pdpn("bids_func_ses+_task-boo_run+") == \ # order should not matter assert pdpn("bids_func_ses+_run+_task-boo") == \ { - 'modality': 'func', - # 'modality_label': 'bold', + 'seqtype': 'func', + # 'seqtype_label': 'bold', 'session': '+', 'run': '+', 'task': 'boo', @@ -347,7 +347,7 @@ def test_parse_dbic_protocol_name(): pdpn("bids_func-pace_ses-1_run-2_task-boo_acq-bu_bids-please__therest") == \ 
pdpn("func-pace_ses-1_task-boo_acq-bu_bids-please_run-2") == \ { - 'modality': 'func', 'modality_label': 'pace', + 'seqtype': 'func', 'seqtype_label': 'pace', 'session': '1', 'run': '2', 'task': 'boo', @@ -357,7 +357,7 @@ def test_parse_dbic_protocol_name(): assert pdpn("bids_anat-scout_ses+") == \ { - 'modality': 'anat', - 'modality_label': 'scout', + 'seqtype': 'anat', + 'seqtype_label': 'scout', 'session': '+', } \ No newline at end of file From 0a45c0c41355c02586993181d32ada8838813480 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 14 Oct 2016 16:13:55 -0400 Subject: [PATCH 017/181] Fixups for datalad'ing the datasets, having no spurious _run, and normalizing sid --- bin/heudiconv | 11 +++++++++- heuristics/dbic_bids.py | 47 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 80de0e0f..8da0b930 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1062,6 +1062,7 @@ def get_study_sessions(dicom_dir_template, files_opt, heuristic, outputdir, # TODO: probably infotoids is doomed to do more and possibly # split into multiple sessions!!!! but then it should be provided # full seqinfo with files which it would place into multiple groups + lgr.info("Study session for %s" % str(ids)) study_session_info = StudySessionInfo( ids.get('locator'), ids.get('session', session), @@ -1179,14 +1180,16 @@ def add_to_datalad(topdir, studydir, bids=False): assert not studyrelpath.startswith(os.path.pardir) # so we are under # now we need to test and initiate a DataLad dataset all along the path curdir = topdir + superds = None for subdir in [''] + studyrelpath.split(os.path.sep): curdir = opj(curdir, subdir) ds = Dataset(curdir) if not ds.is_installed(): lgr.info("Initiating %s", ds) - ds_ = create(curdir, force=True, annex_version=6) + ds_ = create(curdir, dataset=superds, force=True, annex_version=6) assert ds == ds_ assert ds.is_installed() + superds = ds create_file_if_missing( opj(studydir, '.gitattributes'), @@ -1195,11 +1198,15 @@ def add_to_datalad(topdir, studydir, bids=False): *.json annex.largefiles=nothing *.txt annex.largefiles=nothing *.tsv annex.largefiles=nothing +*.nii.gz annex.largefiles=(largerthan=0kb) +*.tgz annex.largefiles=(largerthan=0kb) """) # so for mortals it just looks like a regular directory! 
ds.config.add('annex.thin', 'true', where='local') # Let's make it a dstop = Dataset(topdir) + # ideally we should save from within the subdataset, + # so let's dstop.save(auto_add_changes=True, recursive=True) # TODO: they are still appearing as native annex symlinked beasts @@ -1373,6 +1380,8 @@ s3 # TODO: --datalad cmdline option, which would take care about initiating # the outputdir -> study_outputdir datasets if not yet there + if args.datalad: + add_to_datalad(outputdir, study_outputdir, bids=args.bids) convert_dicoms( subject, diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index c7f0c75b..5ff6f979 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -1,5 +1,5 @@ import os - +import re from collections import defaultdict import logging @@ -141,6 +141,9 @@ def infotodict(seqinfo): run_label = "run-" + ("%02d" % current_run if isinstance(current_run, int) else current_run) + else: + # if there is no _run -- no run label addded + run_label = None suffix_parts = [ None if not regd.get('task') else "task-%s" % regd['task'], @@ -203,9 +206,14 @@ def infotoids(seqinfos, outputdir): # decide on subjid and session based on patient_id lgr.info("Processing sequence infos to deduce study/session") study_description = get_unique(seqinfos, 'study_description') - subject = get_unique(seqinfos, 'patient_id') + subject = fixup_subjectid(get_unique(seqinfos, 'patient_id')) # TODO: fix up subject id if missing some 0s - locator = study_description.replace('^', '/') + split = study_description.split('^', 1) + # split first one even more, since couldbe PI_Student + split = split[0].split('_', 1) + split[1:] + + # locator = study_description.replace('^', '/') + locator = '/'.join(split) # TODO: actually check if given study is study we would care about # and if not -- we should throw some ???? exception @@ -216,9 +224,9 @@ def infotoids(seqinfos, outputdir): # to figure out presence of sessions. 
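A worked example of the locator derivation a few lines above; the StudyDescription value is made up:

    study_description = 'Halchenko_Smith^testStudy'    # hypothetical 'PI_Student^dataset'
    split = study_description.split('^', 1)            # ['Halchenko_Smith', 'testStudy']
    split = split[0].split('_', 1) + split[1:]          # ['Halchenko', 'Smith', 'testStudy']
    locator = '/'.join(split)                           # -> 'Halchenko/Smith/testStudy'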
ses_markers = [ parse_dbic_protocol_name(s.protocol_name).get('session', None) for s in seqinfos - ] + if not s.is_derived + ] ses_markers = filter(bool, ses_markers) # only present ones - session = None if ses_markers: # we have a session or possibly more than one even @@ -269,6 +277,11 @@ def parse_dbic_protocol_name(protocol_name): # We need to figure out if it is a valid bids split = protocol_name.split('_') prefix = split[0] + + # Fixups + if prefix == 'scout': + prefix = split[0] = 'anat-scout' + if prefix != 'bids' and '-' in prefix: prefix, _ = prefix.split('-', 1) if prefix == 'bids': @@ -300,6 +313,11 @@ def split2(s): if value is None and key[-1] in "+=": value = key[-1] key = key[:-1] + + # sanitize values, which must not have _ and - is undesirable ATM as well + # TODO: BIDSv2.0 -- allows "-" so replace with it instead + value = str(value).replace('_', 'X').replace('-', 'X') + if key in ['ses', 'run', 'task', 'acq']: # those we care about explicitly regd[{'ses': 'session'}.get(key, key)] = value @@ -322,6 +340,25 @@ def split2(s): return regd +def fixup_subjectid(subjectid): + """Just in case someone managed to miss a zero or added an extra one""" + reg = re.match("sid0*(\d+)$", subjectid) + if not reg: + # some completely other pattern + return subjectid + return "sid%06d" % int(reg.groups()[0]) + + +def test_fixupsubjectid(): + assert fixup_subjectid("abra") == "abra" + assert fixup_subjectid("sub") == "sub" + assert fixup_subjectid("sid") == "sid" + assert fixup_subjectid("sid000030") == "sid000030" + assert fixup_subjectid("sid0000030") == "sid000030" + assert fixup_subjectid("sid00030") == "sid000030" + assert fixup_subjectid("sid30") == "sid000030" + + def test_parse_dbic_protocol_name(): pdpn = parse_dbic_protocol_name From 9efd82ec38748dc7de0ff1cf41fb46f3087f42fe Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 18 Oct 2016 22:31:11 -0400 Subject: [PATCH 018/181] ENH: reproducible dicoms (yet to test) by mocking out time.time and providing dicom series time for files mtime --- bin/heudiconv | 63 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 11 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 8da0b930..2e485915 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -232,6 +232,8 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): # and how would then we deal with series numbers # which would differ already for fidx, filename in enumerate(fl): + # TODO after getting a regression test check if the same behavior + # with stop_before_pixels=True mw = ds.wrapper_from_data(dcm.read_file(filename, force=True)) for f in ('iop', 'ICE_Dims', 'SequenceName'): @@ -493,20 +495,58 @@ def compress_dicoms(dicom_list, prefix, sourcedir): if os.path.exists(outtar): raise RuntimeError("File %s already exists, will not override" % outtar) - with tarfile.open(outtar, 'w:gz', dereference=True) as tar: - for filename in dicom_list: - outfile = os.path.join(tmpdir, os.path.basename(filename)) - if not os.path.islink(outfile): - os.symlink(os.path.realpath(filename), outfile) - # place into archive stripping any lead directories and - # adding the one corresponding to prefix - tar.add(outfile, - arcname=opj(prefix, os.path.basename(outfile)), - recursive=False) - tar.close() + # tarfile encodes current time.time inside making those non-reproducible + # so we should choose which date to use. 
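Both sources of non-determinism matter here: gzip stores a creation timestamp taken from time.time() in its header, and every tar member carries its own mtime. A minimal standalone sketch of the same idea, with a made-up timestamp and input file name:

    import tarfile
    import time

    dcm_time = 1476447600              # would come from the DICOM SeriesDate/SeriesTime

    def _assign_time(ti):
        ti.mtime = dcm_time            # normalize the mtime of every archived member
        return ti

    _old_time, time.time = time.time, lambda: dcm_time   # gzip header consults time.time()
    try:
        with tarfile.open('precious.dicom.tgz', 'w:gz') as tar:
            tar.add('some.dcm', filter=_assign_time)
    finally:
        time.time = _old_time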
+ # Solution from DataLad although ugly enough: + + dicom_list = sorted(dicom_list) + dcm_time = get_dicom_series_time(dicom_list) + + def _assign_dicom_time(ti): + # Reset the date to match the one of the last commit, not from the + # filesystem since git doesn't track those at all + ti.mtime = dcm_time + return ti + + # poor man mocking since can't rely on having mock + try: + import time + _old_time = time.time + time.time = lambda: dcm_time + with tarfile.open(outtar, 'w:gz', dereference=True) as tar: + for filename in dicom_list: + outfile = os.path.join(tmpdir, os.path.basename(filename)) + if not os.path.islink(outfile): + os.symlink(os.path.realpath(filename), outfile) + # place into archive stripping any lead directories and + # adding the one corresponding to prefix + tar.add(outfile, + arcname=opj(prefix, os.path.basename(outfile)), + recursive=False, + filter=_assign_dicom_time()) + finally: + time.time = _old_time + shutil.rmtree(tmpdir) +def get_dicom_series_time(dicom_list): + """Get time in seconds since epoch from dicom series date and time + + Primarily to be used for reproducible time stamping + """ + import time + import calendar + import dicom as dcm + + dcm = dcm.read_file(dicom_list[0], stop_before_pixels=True) + dcm_date = dcm.SeriesDate # YYYYMMDD + dcm_time = dcm.SeriesTime # HHMMSS.MICROSEC + dicom_time_str = dcm_date + dcm_time.split('.', 1)[0] # YYYYMMDDHHMMSS + # convert to epoch + return calendar.timegm(time.strptime(dicom_time_str, '%Y%m%d%H%M%S')) + + def safe_copyfile(src, dest): """Copy file but blow if destination name already exists """ @@ -1186,6 +1226,7 @@ def add_to_datalad(topdir, studydir, bids=False): ds = Dataset(curdir) if not ds.is_installed(): lgr.info("Initiating %s", ds) + # would require annex > 20161018 for correct operation on annex v6 ds_ = create(curdir, dataset=superds, force=True, annex_version=6) assert ds == ds_ assert ds.is_installed() From 3c45bbf728e947b6c69807d574d207c192d1d90e Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Wed, 19 Oct 2016 23:10:16 -0400 Subject: [PATCH 019/181] [DATALAD] new dataset --- .datalad/config | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .datalad/config diff --git a/.datalad/config b/.datalad/config new file mode 100644 index 00000000..423a607f --- /dev/null +++ b/.datalad/config @@ -0,0 +1,2 @@ +[datalad "dataset"] + id = b90e9412-9672-11e6-814e-8019340ce7f2 From a2b03ce7ac69ce80d2cb9753ab76395bac837546 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 20 Oct 2016 00:06:40 -0400 Subject: [PATCH 020/181] BF: workaround for datalad issue with create/save dance + removing trailing spaces from json files --- bin/heudiconv | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 2e485915..e100c194 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -126,8 +126,14 @@ def save_json(filename, data): Dictionary to save in json file. 
""" + # adds trailing whitespaces due to indent + # see https://bugs.python.org/issue16333 + # seems to be fixed in 3.4, for now fixing manually + # not only for aestetics but also to help guaranteeing the + # same result across versions of Python with open(filename, 'w') as fp: - json.dump(data, fp, sort_keys=True, indent=4) + json_str = json.dumps(data, sort_keys=True, indent=4) + fp.write(json_str.replace(' \n', '\n')) def load_json(filename): @@ -1210,24 +1216,30 @@ def add_participant_record(studydir, subject, age, sex): + '\n') -def add_to_datalad(topdir, studydir, bids=False): +def add_to_datalad(topdir, studydir, msg=None, bids=False): """Do all necessary preparations (if were not done before) and save """ from datalad.api import create from datalad.api import Dataset + from datalad.support.annexrepo import AnnexRepo studyrelpath = os.path.relpath(studydir, topdir) assert not studyrelpath.startswith(os.path.pardir) # so we are under # now we need to test and initiate a DataLad dataset all along the path - curdir = topdir + curdir_ = topdir superds = None - for subdir in [''] + studyrelpath.split(os.path.sep): - curdir = opj(curdir, subdir) - ds = Dataset(curdir) + subdirs = [''] + studyrelpath.split(os.path.sep) + for isubdir, subdir in enumerate(subdirs): + curdir_ = opj(curdir_, subdir) + ds = Dataset(curdir_) if not ds.is_installed(): lgr.info("Initiating %s", ds) # would require annex > 20161018 for correct operation on annex v6 - ds_ = create(curdir, dataset=superds, force=True, annex_version=6) + ds_ = create(curdir_, dataset=superds, + force=True, + if_dirty='ignore', # see https://github.com/datalad/datalad/issues/1016 + no_annex=True, # need to add .gitattributes first anyways + annex_version=6) assert ds == ds_ assert ds.is_installed() superds = ds @@ -1244,10 +1256,13 @@ def add_to_datalad(topdir, studydir, bids=False): """) # so for mortals it just looks like a regular directory! ds.config.add('annex.thin', 'true', where='local') + # initialize annex there if not yet initialized + AnnexRepo(ds.path, init=True) # Let's make it a dstop = Dataset(topdir) # ideally we should save from within the subdataset, - # so let's + # but https://github.com/datalad/datalad/pull/987 is not yet there + # so for now saving everything dstop.save(auto_add_changes=True, recursive=True) # TODO: they are still appearing as native annex symlinked beasts From 4d2c633f8851327eaa3992374154471fef981402 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 20 Oct 2016 00:12:07 -0400 Subject: [PATCH 021/181] Adjusted description a bit (might still not correspond fully) and a version. --- bin/heudiconv | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index e100c194..01d2e379 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1,19 +1,25 @@ #!/usr/bin/env python -"""Convert DICOM TimTrio dirs based on heuristic info +"""Convert DICOM dirs based on heuristic info -This function uses DicomStack and mri_convert to convert Siemens -TrioTim directories. It proceeds by extracting dicominfo from each -subject and writing a config file $subject_id/$subject_id.auto.txt in -the output directory. Users can create a copy of the file called -$subject_id.edit.txt and modify it to change the files that are -converted. This edited file will always overwrite the original file. 
If -there is a need to revert to original state, please delete this edit.txt -file and rerun the conversion +This script uses DicomStack and mri_convert to convert DICOM directories. + +It has multiple modes of operation + +- If subject ID(s) is specified, it proceeds by extracting dicominfo from each + subject and writing a config file $subject_id/$subject_id.auto.txt in + the output directory. Users can create a copy of the file called + $subject_id.edit.txt and modify it to change the files that are + converted. This edited file will always overwrite the original file. If + there is a need to revert to original state, please delete this edit.txt + file and rerun the conversion + +- If no subject specified, all files under specified directory are scanned, + DICOMs are sorted based on study UID, and layed out using specified heuristic """ -__version__ = '0.1' +__version__ = '0.2' import argparse from glob import glob From 1d548eca3646a3b906ce0cee94626f0f9fcee29d Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 20 Oct 2016 00:26:36 -0400 Subject: [PATCH 022/181] NF: added a single sample dicom file (fieldmap only phase from dartmouth-phantoms/bids_test4-20161014/phantom-1/fmap_acq-3mm --- ....5.2.43.66112.2016101409263663466202201.dcm | Bin 0 -> 122162 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/data/fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm diff --git a/tests/data/fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm b/tests/data/fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm new file mode 100644 index 0000000000000000000000000000000000000000..bafdaa512ee74b9c4379f60d66c10a6ce81de89c GIT binary patch literal 122162 zcmeFa4V)xdRVNzNU6o(`QO)$s&;u?6h2RX(Nk-&1^bAu~S=CibeNAQc$1tNsRAy9F z+L0MG`O#esvJ5MVuCCwvob|{0#{Ipw!qXM?@q2zNDCqcv1wnq|f{F?(Dk|<%e&V9M z|G5!2z9TE6GMaIJYj)qt$cTH-J@?#m&OP_sb8iU3v-_zn&##^U3RhMpg?HoogfJnj z&(SwN6DRO}W%+r6l$EoULM|)ia=A*jn3b{xB>ctmlgLp(j(k> zHrj>8zIsv4=ERgRCCt||!g76mdwEj?k|`mDJNF9fo6DD%*Jf89-`HGUo!xx&@r}pV zYRG?=P@7E&9I=#eUf5baD;Nl7OVvD@iCp&{(e{*Z;n;0Y38Dvw7kPfqcz(aw^E=0J zmh$hfL8vL+ep?^(_r-Z#)6_jxyeMwW(et+m^{p2OtIF+; z(rt?QS1c(V{ifP|#oE#}~heUcZnM9zw3u!fsnR*j5@(T+FxIDdAz<%Lp~~ zfG|@#{gkkA&Zy&i@FXE@bX&c)f_%@ZKOzXbVD*3eitwp#z&CO}XE?{Fge%A~D#sV| zxne1wtK@T1B`=)=$8W5i<Khk?8C(hA=ohm)a_xffwcv!5@Uq>u+?2a0^%C}k%5MlY)Su55as4E_u4c=) zeha%UXY-W{!Z+jkgdpMWTX6*p<$P8y$|YI8AbblT`^ zE(nj@RbQGE=Je+63&OecSL!Kn!Nd4Rw7Bd1JleUrehu*MK2Pl_f^Js}Vs?!`UO zJDbbP#gbGgNENx9FJBNYu)Kt;@CZRzTC9urq3>7L@3N!^I4v#A))qGLBz0~cJq8tZ zc|lyevMLsEFAe-B(XTmNpF)o&KtWl~7Vi_LAt#f#lS4mD9l6 z`279ff8{Ip3Fiau-FQGQFG=vAmwGKEIDV~ z!(CB0i}J4;R-UVBrlRvjj3Q8BvQ*7uWWYZWv}GIxT%Kca53cTFR}!wyTURppMHVhf z7{+qdVmV)x3i(Ru@rzOc168Gzmr8P>S}5QKxVu`FAh%MnTn3CXN;W}vNxUB;?iKXygmAww2{}J4tgf%E&o6DRuP(@Vijf*mAI8(? 
z;r|Fb4<%o~qjBbd98QZ$v7oXHX6GuKTCkBUUJr}kqV;*Oun~WUvD5k=c?8upMe%Lm zqavEG0`m`}fI9Ll*R;Rc4aO3oui>nVD-|ARKVY=G=%%~S6gBmk$4zfo`+Au1j4fzmi$!W%_Q?&+_Y&t72{alkC5CPc;qeq_U~STg}DT{OG#isL9X@ zRQdKYSI5xz)=`0$ptO;ozb7vd^=6Fy7>E@l1FuU^c^+h!t~41~nw4WC?5GVxZBfA- zD*bP)y&Ah!Snx`Gf{u0ZaH(tV6UpAOY}UVdlRZ;U$)V@~v&!xXPDSU%hU@dFsxa%m z7SfC^{SBRVJiLEQMS2Ui>tRD3R?21McH(`9q{}T*fc{%rijaQ~U`0(SCAEk|Ketau z>$iGF&+2?PB5dtm^rym8?pRpbFY~j_qW! zz#NP2-=^|3hubHKdOH0^Bw$}nBn1O;-V1Q91iaiYXKl9b(_L}+97L<3*22;|k2)>Vf7n_^b%HD3S z&_jC=^YZM+)`$JdQXZtc@HT}FF?QO{+cWl}J!fmmDk6SJi?g?P(Cv2V%=cRAWIG`` zHE0^k2{P>vbKM-~x`SqwIf4G4wV&%HE$!a(=d~17>~Hgu8DfT;X{NO)Yo0a5%%0$| zd4^2fb+aS)lRNBDI`^G0ogbf*>`h;@EA1clc~->_X&s%xN_P<%SXkFaUXR}sHi~6y zdA~tdCP&5Q1nq;*Ookb1Gw6ha*nh2VZ!tBci7hV&Z4>*RZ4uN3`*VATZnj0WqExh( zsmc+Vpw*domSqp*58Dy_b?1139IHycSCc9Bpl+j5J|hL$wH}~b?JLY~*}00p9q(zU zrJi=x*_)3i+qoKv>-K`Ht?Pmu(}VgqOn2LX;Y@$c?lNiSQ}c+OX6ghDgF`Oc2R2RD zn_q1+`ui;D!4Aw0zImQ@HRAVe^yeG)3CTzOWAz!~@VgPAB% zdcaktS~m8*ef#8AEt)zW&ev9nM|DbkbbPgK5u0H0hs*qtXhaj18BcIRa`bxat=M!| zH0bYt(){*V!@Swe6mq~9sgRE~wF8*mhWH&VkCQ}1O zYJYRqb~8=oXS0S}JR5dStZ@~>4K5X1PPM2QZuRH=MSD$l#Ur+3g87HeN$#VnEM#wL zpq=PP+jFVBu63kSe0BxS#WKk9N5XXJ;2Vf>`K7=84xh){ZumXO z^pul!wYEt#r9ZAUhurMQNY|MDm1R3bCMAE=>84I(uYJ-+qaQ?Cr1toF@$KPAo+=Md z)S&*x!*r9KIxOqW>wcK+mAaE!IVzOYuJ|db8q3SmxLfVF_G5M~*>w&EO9z`N@AI_w ze$xyUl(ideQCVT@%j(o)+SwfSZ=tB~u;>GsrN1Wb2`kv*;ZweIWPqvR>bUM{FD5gC zhBDo^)C!5w^uLewmtcXdYtKoSAQ^t__vrNK@|d%Y-DcK_4gEYf*iPpuSbjUoZ2)7T z6b$mpQuB>;w)yPa`mY_wlUXZIlRN41Sn#r}x2uyCc`CY1=f)RlPHa`$z{nH6qt1(t Tw#Sog Date: Thu, 20 Oct 2016 00:35:46 -0400 Subject: [PATCH 023/181] ENH+BF: make dicoms reproducible, simplify API etc --- bin/heudiconv | 31 +++++++++++++++++++++++++------ tests/test_tarballs.py | 34 ++++++++++++++++++++++++++++++++++ tests/utils.py | 6 ++++++ 3 files changed, 65 insertions(+), 6 deletions(-) create mode 100644 tests/test_tarballs.py create mode 100644 tests/utils.py diff --git a/bin/heudiconv b/bin/heudiconv index 01d2e379..73776d0c 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -501,9 +501,27 @@ def embed_nifti(dcmfiles, niftifile, infofile, bids_info=None, force=False): return niftifile, infofile -def compress_dicoms(dicom_list, prefix, sourcedir): +def compress_dicoms(dicom_list, out_prefix): + """Archives DICOMs into a tarball + + Also tries to do it reproducibly, so takes the date for files + and target tarball based on the series time (within the first file) + + Parameters + ---------- + dicom_list : list of str + list of dicom files + out_prefix : str + output path prefix, including the portion of the output file name + before .dicom.tgz suffix + + Returns + ------- + filename : str + Result tarball + """ tmpdir = mkdtemp(prefix='dicomtar') - outtar = os.path.join(sourcedir, prefix + '.dicom.tgz') + outtar = out_prefix + '.dicom.tgz' if os.path.exists(outtar): raise RuntimeError("File %s already exists, will not override" % outtar) @@ -533,13 +551,15 @@ def compress_dicoms(dicom_list, prefix, sourcedir): # place into archive stripping any lead directories and # adding the one corresponding to prefix tar.add(outfile, - arcname=opj(prefix, os.path.basename(outfile)), + arcname=opj(basename(out_prefix), + os.path.basename(outfile)), recursive=False, - filter=_assign_dicom_time()) + filter=_assign_dicom_time) finally: time.time = _old_time shutil.rmtree(tmpdir) + return outtar def get_dicom_series_time(dicom_list): @@ -609,8 +629,7 @@ def convert(items, symlink=True, converter=None, if not os.path.exists(sourcedir_): os.makedirs(sourcedir_) compress_dicoms(item_dicoms, - 
os.path.basename(prefix), - sourcedir_) + opj(sourcedir_, os.path.basename(prefix))) else: dicomdir = prefix + '_dicom' if os.path.exists(dicomdir): diff --git a/tests/test_tarballs.py b/tests/test_tarballs.py new file mode 100644 index 00000000..1b8c8de8 --- /dev/null +++ b/tests/test_tarballs.py @@ -0,0 +1,34 @@ +import os +import pytest +import sys +import time + +from mock import patch +from os.path import join as opj +from os.path import dirname +from six.moves import StringIO +from glob import glob + +from . import heudiconv +from .utils import md5sum + +tests_datadir = opj(dirname(__file__), 'data') + + +def test_reproducibility(tmpdir): + #heudiconv.compress_dicoms(dicom_list, prefix, sourcedir) + prefix = str(tmpdir.join("precious")) + args = [glob(opj(tests_datadir, 'fmap_acq-3mm', '*')), prefix] + tarball = heudiconv.compress_dicoms(*args) + md5 = md5sum(tarball) + assert tarball + # must not override by default + with pytest.raises(RuntimeError): + heudiconv.compress_dicoms(*args) + os.unlink(tarball) + + time.sleep(1.1) # need to guarantee change of time + tarball_ = heudiconv.compress_dicoms(*args) + md5_ = md5sum(tarball_) + assert tarball == tarball_ + assert md5 == md5_ diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 00000000..4de67fae --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,6 @@ +import hashlib + + +def md5sum(filename): + with open(filename, 'rb') as f: + return hashlib.md5(f.read()).hexdigest() From 341222519ef61e591b40f985c9d64c2b0c3fb50d Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 20 Oct 2016 21:41:58 -0400 Subject: [PATCH 024/181] ENH(TST): very basic smoke tests for converall and dbic_bids heuristics --- tests/test_heuristics.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 tests/test_heuristics.py diff --git a/tests/test_heuristics.py b/tests/test_heuristics.py new file mode 100644 index 00000000..b60ab56d --- /dev/null +++ b/tests/test_heuristics.py @@ -0,0 +1,37 @@ +from . 
import heudiconv + +import pytest + +from datalad.api import Dataset + + +def test_smoke_converall(tmpdir): + heudiconv.main( + ("-f heuristics/convertall.py -c dcm2niix -o %s -b --datalad " + "-s fmap_acq-3mm -d tests/data/%%s/*" % tmpdir).split(' ') + ) + + +def test_dbic_bids_largely_smoke(tmpdir): + args = ("-f heuristics/dbic_bids.py -c dcm2niix -o %s -b " + "--datalad tests/data" % tmpdir).split(' '); + heudiconv.main(args) + ds = Dataset(str(tmpdir)) + assert ds.is_installed() + assert not ds.repo.dirty + head = ds.repo.get_hexsha() + + # and if we rerun -- should fail + with pytest.raises(RuntimeError): + heudiconv.main(args) + # but there should be nothing new + assert not ds.repo.dirty + assert head == ds.repo.get_hexsha() + + # unless we pass 'overwrite' flag + heudiconv.main(args + ['--overwrite']) + # but result should be exactly the same, so it still should be clean + # and at the same commit + assert ds.is_installed() + assert not ds.repo.dirty + assert head == ds.repo.get_hexsha() \ No newline at end of file From 968451397cee7dbf18c6df364a568fa41332f5f4 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 20 Oct 2016 22:12:54 -0400 Subject: [PATCH 025/181] RF+BF+ENH (BIG:-/): --overwrite (to remove safety protection for overwrites), unified some names, etc --- bin/heudiconv | 372 +++++++++++++++++++++++----------------- heuristics/dbic_bids.py | 6 +- 2 files changed, 221 insertions(+), 157 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 73776d0c..fce9f99c 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -52,6 +52,10 @@ logging.basicConfig( lgr.debug("Starting the abomination") # just to "run-test" logging +global_options = { + 'overwrite': False # overwrite existing files +} + SeqInfo = namedtuple( 'SeqInfo', ['total_files_till_now', # 0 @@ -110,7 +114,8 @@ class TempDirs(object): lgr.info("Removing %d temporary directories", len(self.dirs)) for t in self.dirs[:]: lgr.debug("Removing %s", t) - self.rmtree(t) + if self: + self.rmtree(t) def rmtree(self, tmpdir): if os.path.exists(tmpdir): @@ -522,7 +527,7 @@ def compress_dicoms(dicom_list, out_prefix): """ tmpdir = mkdtemp(prefix='dicomtar') outtar = out_prefix + '.dicom.tgz' - if os.path.exists(outtar): + if os.path.exists(outtar) and not global_options['overwrite']: raise RuntimeError("File %s already exists, will not override" % outtar) # tarfile encodes current time.time inside making those non-reproducible @@ -543,6 +548,9 @@ def compress_dicoms(dicom_list, out_prefix): import time _old_time = time.time time.time = lambda: dcm_time + if exists(outtar): + # could be under annex and forbid inplace change + os.unlink(outtar) with tarfile.open(outtar, 'w:gz', dereference=True) as tar: for filename in dicom_list: outfile = os.path.join(tmpdir, os.path.basename(filename)) @@ -584,7 +592,7 @@ def safe_copyfile(src, dest): """ if os.path.isdir(dest): dest = os.path.join(dest, os.path.basename(src)) - if os.path.lexists(dest): + if os.path.lexists(dest) and not global_options['overwrite']: raise ValueError("was asked to copy %s but destination already exists: %s" % (src, dest)) shutil.copyfile(src, dest) @@ -847,48 +855,12 @@ def convert_dicoms(sid, outdir, heuristic, converter, - queue=None, - anon_sid_cmd=None, anon_outdir=None, with_prov=False, + anon_sid=None, anon_outdir=None, + with_prov=False, ses=None, is_bids=False, seqinfo=None): if True: # just to minimize diff for now, remove later and dedent - # - # TODO: Also better lives outside and just replicates all cmdline args? 
- # - if queue: - if seqinfo and not dicoms: - # flatten them all and provide into batching, which again - # would group them... heh - dicoms = sum(seqinfo.values(), []) - # so - raise NotImplementedError( - "we already groupped them so need to add a switch to avoid " - "any groupping, so no outdir prefix doubled etc" - ) - # TODO This needs to be updated to better scale with additional args - progname = os.path.abspath(inspect.getfile(inspect.currentframe())) - convertcmd = ' '.join(['python', progname, - '-o', outdir, - '-f', heuristic.filename, - '-s', sid, - '-c', converter]) - if ses: - convertcmd += " --ses '%s'" % ses - if with_prov: - convertcmd += " --with-prov" - if is_bids: - convertcmd += " --bids" - convertcmd += ["'%s'" % f for f in dicoms] - - script_file = 'dicom-%s.sh' % sid - with open(script_file, 'wt') as fp: - fp.writelines(['#!/bin/bash\n', convertcmd]) - outcmd = 'sbatch -J dicom-%s -p %s -N1 -c2 --mem=20G %s' \ - % (sid, queue, script_file) - os.system(outcmd) - return - if dicoms: lgr.info("Processing %d dicoms", len(dicoms)) elif seqinfo: @@ -901,13 +873,10 @@ def convert_dicoms(sid, # dcmsessions = # - # Annonimization + # Annonimization parameters # - anon_sid = sid - if anon_sid_cmd is not None: - from subprocess import check_output - anon_sid = check_output([anon_sid_cmd, sid]).strip() - lgr.info("Annonimized sid %s into %s", sid, anon_sid) + if anon_sid is None: + anon_sid = sid if anon_outdir is None: anon_outdir = outdir @@ -987,6 +956,25 @@ def convert_dicoms(sid, sourcedir=sourcedir, outdir=tdir) + if is_bids: + if seqinfo: + add_participant_record( + anon_outdir, + anon_sid, + seqinfo.keys()[0].patient_age, + seqinfo.keys()[0].patient_sex, + ) + populate_bids_templates(anon_outdir) + + +def get_annonimized_sid(sid, anon_sid_cmd): + anon_sid = sid + if anon_sid_cmd is not None: + from subprocess import check_output + anon_sid = check_output([anon_sid_cmd, sid]).strip() + lgr.info("Annonimized sid %s into %s", sid, anon_sid) + return anon_sid + def get_extracted_dicoms(fl): """Given a list of files, possibly extract some from tarballs @@ -1052,8 +1040,8 @@ def load_heuristic(heuristic_file): return mod -def get_study_sessions(dicom_dir_template, files_opt, heuristic, outputdir, - session, subjs): +def get_study_sessions(dicom_dir_template, files_opt, heuristic, outdir, + session, sids): """Given options from cmdline sort files or dicom seqinfos into study_sessions which put together files for a single session of a subject in a study @@ -1068,13 +1056,13 @@ def get_study_sessions(dicom_dir_template, files_opt, heuristic, outputdir, if dicom_dir_template: dicom_dir_template = os.path.abspath(dicom_dir_template) assert not files_opt # see above TODO - assert subjs + assert sids # expand the input template if '%s' not in dicom_dir_template: raise ValueError( "dicom dir template must have '%s' as a placeholder for a " "subject id. 
Got %r" % dicom_dir_template) - for sid in subjs: + for sid in sids: sdir = dicom_dir_template % sid # and see what matches files = sorted(glob(sdir)) @@ -1094,8 +1082,8 @@ def get_study_sessions(dicom_dir_template, files_opt, heuristic, outputdir, )] = files_ else: # prep files - assert (files_opt) - assert (not subjs) + assert files_opt + assert not sids files = [] for f in files_opt: if isdir(f): @@ -1129,7 +1117,7 @@ def get_study_sessions(dicom_dir_template, files_opt, heuristic, outputdir, # actually probably there should be a dedicated exception for # heuristics to throw if they detect that the study they are given # is not the one they would be willing to work on - ids = heuristic.infotoids(seqinfo.keys(), outputdir=outputdir) + ids = heuristic.infotoids(seqinfo.keys(), outdir=outdir) # TODO: probably infotoids is doomed to do more and possibly # split into multiple sessions!!!! but then it should be provided # full seqinfo with files which it would place into multiple groups @@ -1288,7 +1276,7 @@ def add_to_datalad(topdir, studydir, msg=None, bids=False): # ideally we should save from within the subdataset, # but https://github.com/datalad/datalad/pull/987 is not yet there # so for now saving everything - dstop.save(auto_add_changes=True, recursive=True) + dstop.save(message=msg, auto_add_changes=True, recursive=True) # TODO: they are still appearing as native annex symlinked beasts """ @@ -1328,14 +1316,156 @@ def setup_exceptionhook(): sys.excepthook = _pdb_excepthook -def main(args=None): +def _main(args): + """Given a structure of arguments from the parser perform computation""" + # + # Load heuristic -- better do it asap to make sure it loads correctly + # + heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) + + # + # Deal with provided files or templates + # + + # + # pre-process provided list of files and possibly sort into groups/sessions + # + + # Group files per each study/sid/session + + dicom_dir_template = args.dicom_dir_template + files_opt = args.files + session = args.session + subjs = args.subjs + outdir = os.path.abspath(args.outdir) + + # TODO: Move into a function! + study_sessions = get_study_sessions( + dicom_dir_template, files_opt, + heuristic, outdir, session, subjs) + # extract tarballs, and replace their entries with expanded lists of files + # TODO: we might need to sort so sessions are ordered??? + lgr.info("Need to process %d study sessions", len(study_sessions)) + + # + # processed_studydirs = set() + + for (locator, session, sid), files_or_seqinfo in study_sessions.items(): + + if not len(files_or_seqinfo): + raise ValueError("nothing to process?") + # that is how life is ATM :-/ since we don't do sorting if subj + # template is provided + if isinstance(files_or_seqinfo, dict): + assert(isinstance(list(files_or_seqinfo.keys())[0], SeqInfo)) + dicoms = None + seqinfo = files_or_seqinfo + else: + dicoms = files_or_seqinfo + seqinfo = None + + if args.queue: + if seqinfo and not dicoms: + # flatten them all and provide into batching, which again + # would group them... 
heh + dicoms = sum(seqinfo.values(), []) + # so + raise NotImplementedError( + "we already groupped them so need to add a switch to avoid " + "any groupping, so no outdir prefix doubled etc" + ) + # TODO This needs to be updated to better scale with additional args + progname = os.path.abspath(inspect.getfile(inspect.currentframe())) + convertcmd = ' '.join(['python', progname, + '-o', study_outdir, + '-f', heuristic.filename, + '-s', sid, + '--anon-cmd', args.anon_cmd, + '-c', args.converter]) + if session: + convertcmd += " --ses '%s'" % session + if args.with_prov: + convertcmd += " --with-prov" + if args.bids: + convertcmd += " --bids" + convertcmd += ["'%s'" % f for f in dicoms] + + script_file = 'dicom-%s.sh' % sid + with open(script_file, 'wt') as fp: + fp.writelines(['#!/bin/bash\n', convertcmd]) + outcmd = 'sbatch -J dicom-%s -p %s -N1 -c2 --mem=20G %s' \ + % (sid, args.queue, script_file) + os.system(outcmd) + continue + + anon_sid = get_annonimized_sid(sid, args.anon_cmd) + + study_outdir = opj(outdir, locator or '') + + anon_outdir = args.conv_outdir or outdir + anon_study_outdir = opj(anon_outdir, locator or '') + + # TODO: --datalad cmdline option, which would take care about initiating + # the outdir -> study_outdir datasets if not yet there + if args.datalad: + datalad_msg_suf = ' %s' % anon_sid + if session: + datalad_msg_suf += ", session %s" % session + if seqinfo: + datalad_msg_suf += ", %d sequences" % len(seqinfo) + datalad_msg_suf += ", %d dicoms" % ( + len(sum(seqinfo.values(), [])) if seqinfo else len(dicoms) + ) + from datalad.api import Dataset + ds = Dataset(anon_study_outdir) + if not exists(anon_outdir) or not ds.is_installed(): + add_to_datalad( + anon_outdir, anon_study_outdir, + msg="Preparing for %s" % datalad_msg_suf, + bids=args.bids) + + convert_dicoms( + sid, + dicoms, + study_outdir, + heuristic=heuristic, + converter=args.converter, + anon_sid=anon_sid, + anon_outdir=anon_study_outdir, + with_prov=args.with_prov, + ses=session, + is_bids=args.bids, + seqinfo=seqinfo) + + if args.datalad: + msg = "Converted subject %s" % datalad_msg_suf + # TODO: whenever propagate to supers work -- do just + # ds.save(msg=msg) + # also in batch mode might fail since we have no locking ATM + # and theoretically no need actually to save entire study + # we just need that + add_to_datalad(outdir, study_outdir, msg=msg, bids=args.bids) + + # if args.bids: + # # Let's populate BIDS templates for folks to take care about + # for study_outdir in processed_studydirs: + # populate_bids_templates(study_outdir) + # + # # TODO: record_collection of the sid/session although that information + # # is pretty much present in .heudiconv/SUBJECT/info so we could just poke there + + tempdirs.cleanup() + + +def get_parser(): docstr = '\n'.join((__doc__, -""" - Example: + """ + Example: - heudiconv -d rawdata/%s -o . -f heuristic.py -s s1 s2 -s3 -""")) + heudiconv -d rawdata/%s -o . -f + heuristic.py -s s1 s2 + s3 + """)) parser = argparse.ArgumentParser(description=docstr) parser.add_argument('--version', action='version', version=__version__) parser.add_argument('-d', '--dicom_dir_template', @@ -1353,17 +1483,18 @@ s3 'heuristic') parser.add_argument('-c', '--converter', dest='converter', required=True, - choices=('mri_convert', 'dcmstack', 'dcm2nii', 'dcm2niix', - 'none'), - help='''tool to use for dicom conversion. Setting to + choices=( + 'mri_convert', 'dcmstack', 'dcm2nii', 'dcm2niix', + 'none'), + help='''tool to use for dicom conversion. 
Setting to "none" disables the actual conversion step -- useful for testing heuristics.''') - parser.add_argument('-o', '--outdir', dest='outputdir', + parser.add_argument('-o', '--outdir', dest='outdir', default=os.getcwd(), help='''output directory for conversion setup (for further customization and future reference. This directory will refer to non-anonymized subject IDs''') - parser.add_argument('-a', '--conv-outdir', dest='conv_outputdir', + parser.add_argument('-a', '--conv-outdir', dest='conv_outdir', default=None, help='''output directory for converted files. By default this is identical to --outdir. This option is @@ -1374,17 +1505,21 @@ s3 DICOMs to anonymmized IDs. Such command must take a single argument and return a single anonymized ID. Also see --conv-outdir''') - parser.add_argument('-f', '--heuristic', dest='heuristic_file', required=True, + parser.add_argument('-f', '--heuristic', dest='heuristic_file', + required=True, help='python script containing heuristic') parser.add_argument('-q', '--queue', dest='queue', default=None, help='''select batch system to submit jobs to instead of running the conversion serially''') parser.add_argument('-p', '--with-prov', dest='with_prov', action='store_true', - help='''Store additional provenance information. Requires python-rdflib.''') + help='''Store additional provenance information. + Requires python-rdflib.''') parser.add_argument('-ss', '--ses', dest='session', default=None, help='''session for longitudinal study_sessions, default is none''') parser.add_argument('-b', '--bids', dest='bids', action='store_true', help='''flag for output into BIDS structure''') + parser.add_argument('--overwrite', dest='overwrite', action='store_true', + help='''flag to allow overwrite existing files''') parser.add_argument('--datalad', dest='datalad', action='store_true', help='''Store the entire collection as DataLad dataset(s). Small files will be committed directly to git, while large to annex. @@ -1393,15 +1528,20 @@ s3 (i.e. no symlinks to under .git/annex). For now just for BIDS mode.''') parser.add_argument('--dbg', action='store_true', dest='debug', help="do not catch exceptions and show exception traceback") - parser.add_argument( 'files', nargs='*', help="files (tarballs, dicoms) or directories containing files to " "process. Specify one of the --dicom_dir_template or files " "not both") + return parser - args = parser.parse_args(args) + +def main(argv=None): + """Given a list of command line arguments, parse them and pass into _main + """ + parser = get_parser() + args = parser.parse_args(argv) # TODO: deprecate dicom_dir_template in favor of --files-templated or # smth like that which could take {subject} {session} ... and process @@ -1413,90 +1553,14 @@ s3 if args.debug: setup_exceptionhook() - # - # Load heuristic -- better do it asap to make sure it loads correctly - # - heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) - - # - # Deal with provided files or templates - # - - # - # pre-process provided list of files and possibly sort into groups/sessions - # - - # for now will be just - # Group files per each study/subject/session - - dicom_dir_template = args.dicom_dir_template - files_opt = args.files - session = args.session - subjs = args.subjs - outputdir = os.path.abspath(args.outputdir) - - # TODO: Move into a function! 
- study_sessions = get_study_sessions(dicom_dir_template, files_opt, - heuristic, outputdir, session, subjs)# extract tarballs, and replace their entries with expanded lists of files - # TODO: we might need to sort so sessions are ordered??? - lgr.info("Need to process %d study sessions", len(study_sessions)) - - processed_studydirs = set() - - for (locator, session, subject), files_or_seqinfo in study_sessions.items(): - - if not len(files_or_seqinfo): - raise ValueError("nothing to process?") - # that is how life is ATM :-/ since we don't do sorting if subj - # template is provided - if isinstance(files_or_seqinfo, dict): - assert(isinstance(list(files_or_seqinfo.keys())[0], SeqInfo)) - files = None - seqinfo = files_or_seqinfo - else: - files = files_or_seqinfo - seqinfo = None - - study_outputdir = opj(outputdir, locator or '') - - # TODO: --datalad cmdline option, which would take care about initiating - # the outputdir -> study_outputdir datasets if not yet there - if args.datalad: - add_to_datalad(outputdir, study_outputdir, bids=args.bids) - - convert_dicoms( - subject, - files, - study_outputdir, - heuristic=heuristic, - converter=args.converter, - queue=args.queue, - anon_sid_cmd=args.anon_cmd, - anon_outdir=args.conv_outputdir, - with_prov=args.with_prov, - ses=session, - is_bids=args.bids, - seqinfo=seqinfo) - - if args.bids and seqinfo: - add_participant_record(study_outputdir, - subject, - seqinfo.keys()[0].patient_age, - seqinfo.keys()[0].patient_sex, - ) - processed_studydirs.add(study_outputdir) - - if args.bids: - # Let's populate BIDS templates for folks to take care about - for study_outputdir in processed_studydirs: - populate_bids_templates(study_outputdir) - if args.datalad: - add_to_datalad(outputdir, study_outputdir, bids=args.bids) - - # TODO: record_collection of the subject/session although that information - # is pretty much present in .heudiconv/SUBJECT/info so we could just poke there - - tempdirs.cleanup() + orig_global_options = global_options.copy() + try: + global_options['overwrite'] = args.overwrite + return _main(args) + finally: + # reset back + for k, v in orig_global_options.items(): + global_options[k] = v if __name__ == '__main__': diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 5ff6f979..23edec14 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -202,7 +202,7 @@ def get_unique(seqinfos, attr): # TODO: might need to do groupping per each session and return here multiple # hits, or may be we could just somehow demarkate that it will be multisession # one and so then later value parsed (again) in infotodict would be used??? -def infotoids(seqinfos, outputdir): +def infotoids(seqinfos, outdir): # decide on subjid and session based on patient_id lgr.info("Processing sequence infos to deduce study/session") study_description = get_unique(seqinfos, 'study_description') @@ -218,7 +218,7 @@ def infotoids(seqinfos, outputdir): # TODO: actually check if given study is study we would care about # and if not -- we should throw some ???? exception - # So -- use `outputdir` and locator etc to see if for a given locator/subject + # So -- use `outdir` and locator etc to see if for a given locator/subject # and possible ses+ in the sequence names, so we would provide a sequence # So might need to go through parse_dbic_protocol_name(s.protocol_name) # to figure out presence of sessions. @@ -250,7 +250,7 @@ def infotoids(seqinfos, outputdir): # TODO - I think we are doomed to go through the sequence and split # ... 
actually the same as with nonsign_vals, we just would need to figure # out initial one if sign ones, and should make use of knowing - # outputdir + # outdir #raise NotImplementedError() # Let's be lazy for now just to get somewhere session = '001' From dc88b2c91e8793a01f4b441e6d0173de5983bfff Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 20 Oct 2016 22:14:00 -0400 Subject: [PATCH 026/181] RF: just removed if True block and dedented --- bin/heudiconv | 208 +++++++++++++++++++++++++------------------------- 1 file changed, 103 insertions(+), 105 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index fce9f99c..95b46224 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -849,7 +849,6 @@ def embed_metadata_into_nifti(converter, is_bids, item_dicoms, outname, os.chdir(cwd) - def convert_dicoms(sid, dicoms, outdir, @@ -860,111 +859,110 @@ def convert_dicoms(sid, ses=None, is_bids=False, seqinfo=None): - if True: # just to minimize diff for now, remove later and dedent + if dicoms: + lgr.info("Processing %d dicoms", len(dicoms)) + elif seqinfo: + lgr.info("Processing %d pre-sorted seqinfo entries", len(seqinfo)) + else: + raise ValueError("neither dicoms nor seqinfo dict was provided") + + # in this reimplementation we can have only a single session assigned + # at this point + # dcmsessions = + + # + # Annonimization parameters + # + if anon_sid is None: + anon_sid = sid + if anon_outdir is None: + anon_outdir = outdir + + # Figure out where to stick supplemental info dicoms + idir = os.path.join(outdir, '.heudiconv', sid) + if is_bids and ses: + idir = os.path.join(idir, 'ses-%s' % str(ses)) + # yoh: in my case if idir exists, it means that that study/subject/session + # is already processed + if anon_outdir == outdir: + # if all goes into a single dir, have a dedicated 'info' subdir + idir = os.path.join(idir, 'info') + if not os.path.exists(idir): + os.makedirs(idir) + + shutil.copy(heuristic.filename, idir) + ses_suffix = "_ses-%s" % ses if ses is not None else "" + info_file = os.path.join(idir, '%s%s.auto.txt' % (sid, ses_suffix)) + edit_file = os.path.join(idir, '%s%s.edit.txt' % (sid, ses_suffix)) + filegroup_file = os.path.join(idir, 'filegroup%s.json' % ses_suffix) + + if os.path.exists(edit_file): # XXX may be condition on seqinfo is None + lgr.info("Reloading existing filegroup.json because %s exists", + edit_file) + info = read_config(edit_file) + filegroup = load_json(filegroup_file) + # XXX Yarik finally understood why basedir was dragged along! + # So we could reuse the same PATHs definitions possibly consistent + # across re-runs... BUT that wouldn't work anyways if e.g. + # DICOMs dumped with SOP UUIDs thus differing across runs etc + # So either it would need to be brought back or reconsidered altogether + # (since no sample data to test on etc) + else: + # TODO -- might have been done outside already! 
if dicoms: - lgr.info("Processing %d dicoms", len(dicoms)) - elif seqinfo: - lgr.info("Processing %d pre-sorted seqinfo entries", len(seqinfo)) - else: - raise ValueError("neither dicoms nor seqinfo dict was provided") - - # in this reimplementation we can have only a single session assigned - # at this point - # dcmsessions = - - # - # Annonimization parameters - # - if anon_sid is None: - anon_sid = sid - if anon_outdir is None: - anon_outdir = outdir - - # Figure out where to stick supplemental info dicoms - idir = os.path.join(outdir, '.heudiconv', sid) - if is_bids and ses: - idir = os.path.join(idir, 'ses-%s' % str(ses)) - # yoh: in my case if idir exists, it means that that study/subject/session - # is already processed - if anon_outdir == outdir: - # if all goes into a single dir, have a dedicated 'info' subdir - idir = os.path.join(idir, 'info') - if not os.path.exists(idir): - os.makedirs(idir) - - shutil.copy(heuristic.filename, idir) - ses_suffix = "_ses-%s" % ses if ses is not None else "" - info_file = os.path.join(idir, '%s%s.auto.txt' % (sid, ses_suffix)) - edit_file = os.path.join(idir, '%s%s.edit.txt' % (sid, ses_suffix)) - filegroup_file = os.path.join(idir, 'filegroup%s.json' % ses_suffix) - - if os.path.exists(edit_file): # XXX may be condition on seqinfo is None - lgr.info("Reloading existing filegroup.json because %s exists", - edit_file) - info = read_config(edit_file) - filegroup = load_json(filegroup_file) - # XXX Yarik finally understood why basedir was dragged along! - # So we could reuse the same PATHs definitions possibly consistent - # across re-runs... BUT that wouldn't work anyways if e.g. - # DICOMs dumped with SOP UUIDs thus differing across runs etc - # So either it would need to be brought back or reconsidered altogether - # (since no sample data to test on etc) - else: - # TODO -- might have been done outside already! 
- if dicoms: - seqinfo = group_dicoms_into_seqinfos( - dicoms, - dcmfilter=getattr(heuristic, 'filter_dicom', None)) - seqinfo_list = list(seqinfo.keys()) - filegroup = {si.series_id: x for si, x in seqinfo.items()} - - save_json(filegroup_file, filegroup) - dicominfo_file = os.path.join(idir, 'dicominfo%s.tsv' % ses_suffix) - with open(dicominfo_file, 'wt') as fp: - for seq in seqinfo_list: - fp.write('\t'.join([str(val) for val in seq]) + '\n') - lgr.debug("Calling out to %s.infodict", heuristic) - info = heuristic.infotodict(seqinfo_list) - write_config(info_file, info) - write_config(edit_file, info) - - # - # Conversion - # - - sourcedir = None - if is_bids: - sourcedir = os.path.join(outdir, 'sourcedata') - # the other portion of the path would mimic BIDS layout - # so we don't need to worry here about sub, ses at all - tdir = anon_outdir - else: - tdir = os.path.join(anon_outdir, anon_sid) - - if converter != 'none': - lgr.info("Doing conversion using %s", converter) - cinfo = conversion_info(anon_sid, tdir, info, filegroup, - ses=ses) - convert(cinfo, - converter=converter, - scaninfo_suffix=getattr( - heuristic, 'scaninfo_suffix', '.json'), - custom_callable=getattr( - heuristic, 'custom_callable', None), - with_prov=with_prov, - is_bids=is_bids, - sourcedir=sourcedir, - outdir=tdir) - - if is_bids: - if seqinfo: - add_participant_record( - anon_outdir, - anon_sid, - seqinfo.keys()[0].patient_age, - seqinfo.keys()[0].patient_sex, - ) - populate_bids_templates(anon_outdir) + seqinfo = group_dicoms_into_seqinfos( + dicoms, + dcmfilter=getattr(heuristic, 'filter_dicom', None)) + seqinfo_list = list(seqinfo.keys()) + filegroup = {si.series_id: x for si, x in seqinfo.items()} + + save_json(filegroup_file, filegroup) + dicominfo_file = os.path.join(idir, 'dicominfo%s.tsv' % ses_suffix) + with open(dicominfo_file, 'wt') as fp: + for seq in seqinfo_list: + fp.write('\t'.join([str(val) for val in seq]) + '\n') + lgr.debug("Calling out to %s.infodict", heuristic) + info = heuristic.infotodict(seqinfo_list) + write_config(info_file, info) + write_config(edit_file, info) + + # + # Conversion + # + + sourcedir = None + if is_bids: + sourcedir = os.path.join(outdir, 'sourcedata') + # the other portion of the path would mimic BIDS layout + # so we don't need to worry here about sub, ses at all + tdir = anon_outdir + else: + tdir = os.path.join(anon_outdir, anon_sid) + + if converter != 'none': + lgr.info("Doing conversion using %s", converter) + cinfo = conversion_info(anon_sid, tdir, info, filegroup, + ses=ses) + convert(cinfo, + converter=converter, + scaninfo_suffix=getattr( + heuristic, 'scaninfo_suffix', '.json'), + custom_callable=getattr( + heuristic, 'custom_callable', None), + with_prov=with_prov, + is_bids=is_bids, + sourcedir=sourcedir, + outdir=tdir) + + if is_bids: + if seqinfo: + add_participant_record( + anon_outdir, + anon_sid, + seqinfo.keys()[0].patient_age, + seqinfo.keys()[0].patient_sex, + ) + populate_bids_templates(anon_outdir) def get_annonimized_sid(sid, anon_sid_cmd): From b9f54b857b9feae7a734e101c36a40be3c0d77b0 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 20 Oct 2016 22:17:43 -0400 Subject: [PATCH 027/181] BF: now we do need all shebang to run tests (nipype, nibabel, dcmstack, etc) --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index dcae7c32..fffe8c82 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,8 +23,8 @@ before_install: # The ultimate one-liner setup for NeuroDebian repository - 
bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh) - travis_retry sudo apt-get update -qq - - travis_retry sudo apt-get install git-annex-standalone - - pip install datalad + - travis_retry sudo apt-get install git-annex-standalone python-dicom python-nipype python-nibabel + - pip install datalad dcmstack install: - git config --global user.email "test@travis.land" From 6719c97a9e3d6b5641e307632a6e416d2249a0ee Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 21 Oct 2016 22:36:45 -0400 Subject: [PATCH 028/181] dcmstack is not yet on pypi --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index fffe8c82..b27729c2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,7 +24,7 @@ before_install: - bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh) - travis_retry sudo apt-get update -qq - travis_retry sudo apt-get install git-annex-standalone python-dicom python-nipype python-nibabel - - pip install datalad dcmstack + - pip install datalad git+git://github.com/moloney/dcmstack/ install: - git config --global user.email "test@travis.land" From bcb473b9119ac99684da2181f8b0b5bc922bc9ad Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 31 Oct 2016 12:28:13 -0400 Subject: [PATCH 029/181] BF: pip install nipype for testing --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index b27729c2..496a8b25 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,6 +25,8 @@ before_install: - travis_retry sudo apt-get update -qq - travis_retry sudo apt-get install git-annex-standalone python-dicom python-nipype python-nibabel - pip install datalad git+git://github.com/moloney/dcmstack/ + # there is only dated nipype for precise from neurodebian + - pip install nipype install: - git config --global user.email "test@travis.land" From 4ed95db35f02c5ae31c9d96b98f0ebdf1855fa0d Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 31 Oct 2016 12:58:59 -0400 Subject: [PATCH 030/181] and configparser --- .travis.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 496a8b25..23d996bd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,8 +25,9 @@ before_install: - travis_retry sudo apt-get update -qq - travis_retry sudo apt-get install git-annex-standalone python-dicom python-nipype python-nibabel - pip install datalad git+git://github.com/moloney/dcmstack/ - # there is only dated nipype for precise from neurodebian - - pip install nipype + # there is only dated nipype for precise from neurodebian. 
+ # TEMP: configparser needs to be listed manually for now: https://github.com/nipy/nipype/pull/1697 + - pip install nipype configparser install: - git config --global user.email "test@travis.land" From 223c3c86ba189f4aac0073af43e568f6d8dd3e24 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 31 Oct 2016 13:15:59 -0400 Subject: [PATCH 031/181] added forgotten dcm2niix --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 23d996bd..e2ee52d4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,7 +23,7 @@ before_install: # The ultimate one-liner setup for NeuroDebian repository - bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh) - travis_retry sudo apt-get update -qq - - travis_retry sudo apt-get install git-annex-standalone python-dicom python-nipype python-nibabel + - travis_retry sudo apt-get install git-annex-standalone python-dicom python-nipype python-nibabel dcm2niix - pip install datalad git+git://github.com/moloney/dcmstack/ # there is only dated nipype for precise from neurodebian. # TEMP: configparser needs to be listed manually for now: https://github.com/nipy/nipype/pull/1697 From a4c40cefbba642fb11b315355f60c0493f17b7f0 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Mon, 31 Oct 2016 21:01:31 -0400 Subject: [PATCH 032/181] Sanitize labels, always have sid lowercase --- heuristics/dbic_bids.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 23edec14..20fcbf8e 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -264,6 +264,11 @@ def infotoids(seqinfos, outdir): } +def sanitize_str(value): + """Remove illegal characters for BIDS from task/acq/etc..""" + return value.translate(None, '#!@$%^&.,:;') + + def parse_dbic_protocol_name(protocol_name): """Parse protocol name """ @@ -320,7 +325,7 @@ def split2(s): if key in ['ses', 'run', 'task', 'acq']: # those we care about explicitly - regd[{'ses': 'session'}.get(key, key)] = value + regd[{'ses': 'session'}.get(key, key)] = sanitize_str(value) else: bids_leftovers.append(s) @@ -342,6 +347,8 @@ def split2(s): def fixup_subjectid(subjectid): """Just in case someone managed to miss a zero or added an extra one""" + # make it lowercase + subjectid = subjectid.lower() reg = re.match("sid0*(\d+)$", subjectid) if not reg: # some completely other pattern @@ -349,6 +356,12 @@ def fixup_subjectid(subjectid): return "sid%06d" % int(reg.groups()[0]) +def test_sanitize_str(): + assert sanitize_str('acq-super@duper.faster') == 'acq-superduperfaster' + assert sanitize_str('acq-perfect') == 'acq-perfect' + assert sanitize_str('acq-never:use:colon:!') == 'acq-neverusecolon' + + def test_fixupsubjectid(): assert fixup_subjectid("abra") == "abra" assert fixup_subjectid("sub") == "sub" @@ -357,6 +370,7 @@ def test_fixupsubjectid(): assert fixup_subjectid("sid0000030") == "sid000030" assert fixup_subjectid("sid00030") == "sid000030" assert fixup_subjectid("sid30") == "sid000030" + assert fixup_subjectid("SID30") == "sid000030" def test_parse_dbic_protocol_name(): From e99c97dc9481b6c521d9b7c7ceb93dafc27ea780 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 31 Oct 2016 23:41:39 -0400 Subject: [PATCH 033/181] For now disable py3 testing --- .travis.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index e2ee52d4..b2169320 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,9 +2,12 @@ 
language: python python: - 2.7 - - 3.3 - - 3.4 - - 3.5 +# For PY3 testing we should use probably conda, so TODO +# since otherwise scipy fails to build ATM etc, see e.g. +# https://travis-ci.org/nipy/heudiconv/jobs/172172847 +# - 3.3 +# - 3.4 +# - 3.5 cache: - apt From 0aba7018fe576015e8d8650cecf4325cc9eea17c Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Tue, 1 Nov 2016 21:26:32 -0400 Subject: [PATCH 034/181] Add some initial hacks for existing protocols --- heuristics/dbic_bids.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 20fcbf8e..304e1200 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -20,6 +20,33 @@ def create_key(subdir, file_suffix, outtype=('nii.gz', 'dicom'), return template, outtype, annotation_classes +# XXX: hackhackhack +protocols2fix = ['dbic^pulse_sequences'] + + +def fix_dbic_protocol(seqinfo): + """Ad-hoc fixup for existing protocols""" + + # get name of the study to check if we know how to fix it up + study_descr = get_unique(seqinfo, 'study_description') + + # need to replace both protocol_name and series_id + keys2replace = ['protocol_name', 'series_id'] + if study_descr == 'dbic^pulse_sequences': + replace = [('anat_', 'anat-'), + ('life[0-9]', 'life')] + for s in seqinfo: + for substring, replacement in replace: + regex = re.compile(substring, re.IGNORECASE) + for key in keys2replace: + new_value = regex.sub(replacement, getattr(s, key)) + setattr(s, key, new_value) + else: + raise ValueError("I don't know how to fix {0}".format(study_descr)) + + return seqinfo + + # XXX we killed session indicator! what should we do now?!!! # WE DON:T NEED IT -- it will be provided into conversion_info as `session` # So we just need subdir and file_suffix! 
@@ -34,6 +61,12 @@ def infotodict(seqinfo): subindex: sub index within group session: scan index for longitudinal acq """ + # XXX: ad hoc hack + study_description = get_unique(seqinfo, 'study_description') + if study_description in protocols2fix: + lgr.info("Fixing up protocol for {0}".format(study_description)) + seqinfo = fix_dbic_protocol(seqinfo) + lgr.info("Processing %d seqinfo entries", len(seqinfo)) and_dicom = ('dicom', 'nii.gz') From 45393d14defe413bd99afd1267ce5e4ebc871739 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Tue, 1 Nov 2016 21:40:40 -0400 Subject: [PATCH 035/181] Namedtuples are immutable --- heuristics/dbic_bids.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 304e1200..01318715 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -35,12 +35,17 @@ def fix_dbic_protocol(seqinfo): if study_descr == 'dbic^pulse_sequences': replace = [('anat_', 'anat-'), ('life[0-9]', 'life')] - for s in seqinfo: - for substring, replacement in replace: - regex = re.compile(substring, re.IGNORECASE) - for key in keys2replace: - new_value = regex.sub(replacement, getattr(s, key)) - setattr(s, key, new_value) + for i, s in enumerate(seqinfo): + fixed_kwargs = dict() + for key in keys2replace: + value = getattr(s, key) + # replace all I need to replace + for substring, replacement in replace: + value = re.sub(substring, replacement, value) + fixed_kwargs[key] = value + # namedtuples are immutable + seqinfo[i] = s._replace(**fixed_kwargs) + else: raise ValueError("I don't know how to fix {0}".format(study_descr)) From b4d77a611347d0a980d2ba22fd3fa08fea71b9ae Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Tue, 1 Nov 2016 21:45:24 -0400 Subject: [PATCH 036/181] Add forgotten key and rename better --- heuristics/dbic_bids.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 01318715..9e8cab35 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -31,10 +31,10 @@ def fix_dbic_protocol(seqinfo): study_descr = get_unique(seqinfo, 'study_description') # need to replace both protocol_name and series_id - keys2replace = ['protocol_name', 'series_id'] + keys2replace = ['protocol_name', 'series_id', 'series_description'] if study_descr == 'dbic^pulse_sequences': replace = [('anat_', 'anat-'), - ('life[0-9]', 'life')] + ('run-life[0-9]', 'run+_task-life')] for i, s in enumerate(seqinfo): fixed_kwargs = dict() for key in keys2replace: From 1da83fb2dd3e1a07267c26b38661fbe00e437ccc Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Tue, 1 Nov 2016 21:50:17 -0400 Subject: [PATCH 037/181] Do not change series_id otherwise heudiconv doesn't know what to do --- heuristics/dbic_bids.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 9e8cab35..bb2aa0f2 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -30,8 +30,8 @@ def fix_dbic_protocol(seqinfo): # get name of the study to check if we know how to fix it up study_descr = get_unique(seqinfo, 'study_description') - # need to replace both protocol_name and series_id - keys2replace = ['protocol_name', 'series_id', 'series_description'] + # need to replace both protocol_name series_description + keys2replace = ['protocol_name', 'series_description'] if study_descr == 'dbic^pulse_sequences': replace = [('anat_', 'anat-'), ('run-life[0-9]', 
'run+_task-life')] From 5b3ad9809c2730f521b737edadde91c284d1ebdc Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Wed, 2 Nov 2016 08:52:55 -0400 Subject: [PATCH 038/181] Refactor fix_dbic_protocol, add tests --- heuristics/dbic_bids.py | 76 ++++++++++++++++++++++++++++++----------- 1 file changed, 57 insertions(+), 19 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index bb2aa0f2..15893942 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -21,33 +21,34 @@ def create_key(subdir, file_suffix, outtype=('nii.gz', 'dicom'), # XXX: hackhackhack -protocols2fix = ['dbic^pulse_sequences'] +protocols2fix = { + 'dbic^pulse_sequences': [('anat_', 'anat-'), + ('run-life[0-9]', 'run+_task-life'), + ('scout_run\+', 'scout')] +} +keys2replace = ['protocol_name', 'series_description'] -def fix_dbic_protocol(seqinfo): +def fix_dbic_protocol(seqinfo, keys=keys2replace, subsdict=protocols2fix): """Ad-hoc fixup for existing protocols""" # get name of the study to check if we know how to fix it up study_descr = get_unique(seqinfo, 'study_description') - # need to replace both protocol_name series_description - keys2replace = ['protocol_name', 'series_description'] - if study_descr == 'dbic^pulse_sequences': - replace = [('anat_', 'anat-'), - ('run-life[0-9]', 'run+_task-life')] - for i, s in enumerate(seqinfo): - fixed_kwargs = dict() - for key in keys2replace: - value = getattr(s, key) - # replace all I need to replace - for substring, replacement in replace: - value = re.sub(substring, replacement, value) - fixed_kwargs[key] = value - # namedtuples are immutable - seqinfo[i] = s._replace(**fixed_kwargs) - - else: + if study_descr not in subsdict: raise ValueError("I don't know how to fix {0}".format(study_descr)) + # need to replace both protocol_name series_description + substitutions = subsdict[study_descr] + for i, s in enumerate(seqinfo): + fixed_kwargs = dict() + for key in keys: + value = getattr(s, key) + # replace all I need to replace + for substring, replacement in substitutions: + value = re.sub(substring, replacement, value) + fixed_kwargs[key] = value + # namedtuples are immutable + seqinfo[i] = s._replace(**fixed_kwargs) return seqinfo @@ -394,6 +395,43 @@ def fixup_subjectid(subjectid): return "sid%06d" % int(reg.groups()[0]) +def test_fix_dbic_protocol(): + from collections import namedtuple + FakeSeqInfo = namedtuple('FakeSeqInfo', + ['study_description', 'field1', 'field2']) + + seq1 = FakeSeqInfo('mystudy', + '02-anat-scout_run+_MPR_sag', + '11-func_run-life2_acq-2mm692') + seq2 = FakeSeqInfo('mystudy', + 'nochangeplease', + 'nochangeeither') + + + seqinfos = [seq1, seq2] + keys = ['field1'] + subsdict = { + 'mystudy': [('scout_run\+', 'scout'), + ('run-life[0-9]', 'run+_task-life')], + } + + seqinfos_ = fix_dbic_protocol(seqinfos, keys=keys, subsdict=subsdict) + assert(seqinfos[1] == seqinfos_[1]) + # field2 shouldn't have changed since I didn't pass it + assert(seqinfos_[0] == FakeSeqInfo('mystudy', + '02-anat-scout_MPR_sag', + seq1.field2)) + + # change also field2 please + keys = ['field1', 'field2'] + seqinfos_ = fix_dbic_protocol(seqinfos, keys=keys, subsdict=subsdict) + assert(seqinfos[1] == seqinfos_[1]) + # now everything should have changed + assert(seqinfos_[0] == FakeSeqInfo('mystudy', + '02-anat-scout_MPR_sag', + '11-func_run+_task-life_acq-2mm692')) + + def test_sanitize_str(): assert sanitize_str('acq-super@duper.faster') == 'acq-superduperfaster' assert sanitize_str('acq-perfect') == 'acq-perfect' From 
b7864a9aea97e3e90895b4114dfae3247f0c9033 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Wed, 2 Nov 2016 09:23:54 -0400 Subject: [PATCH 039/181] Use md5 hash to store substitutions --- heuristics/dbic_bids.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 15893942..dad05111 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -1,6 +1,7 @@ import os import re from collections import defaultdict +import hashlib import logging lgr = logging.getLogger('heudiconv') @@ -20,11 +21,20 @@ def create_key(subdir, file_suffix, outtype=('nii.gz', 'dicom'), return template, outtype, annotation_classes +def md5sum(string): + """Computes md5sum of as string""" + m = hashlib.md5(string.encode()) + return m.hexdigest() + + # XXX: hackhackhack protocols2fix = { - 'dbic^pulse_sequences': [('anat_', 'anat-'), - ('run-life[0-9]', 'run+_task-life'), - ('scout_run\+', 'scout')] + '9d148e2a05f782273f6343507733309d': + [('anat_', 'anat-'), + ('run-life[0-9]', 'run+_task-life'), + ('scout_run\+', 'scout')], + '76b36c80231b0afaf509e2d52046e964': + [('fmap_run\+_2mm', 'fmap_run+_acq-2mm')] } keys2replace = ['protocol_name', 'series_description'] @@ -34,11 +44,12 @@ def fix_dbic_protocol(seqinfo, keys=keys2replace, subsdict=protocols2fix): # get name of the study to check if we know how to fix it up study_descr = get_unique(seqinfo, 'study_description') + study_descr_hash = md5sum(study_descr) - if study_descr not in subsdict: + if study_descr_hash not in subsdict: raise ValueError("I don't know how to fix {0}".format(study_descr)) # need to replace both protocol_name series_description - substitutions = subsdict[study_descr] + substitutions = subsdict[study_descr_hash] for i, s in enumerate(seqinfo): fixed_kwargs = dict() for key in keys: @@ -395,6 +406,11 @@ def fixup_subjectid(subjectid): return "sid%06d" % int(reg.groups()[0]) +def test_md5sum(): + assert md5sum('cryptonomicon') == '1cd52edfa41af887e14ae71d1db96ad1' + assert md5sum('mysecretmessage') == '07989808231a0c6f522f9d8e34695794' + + def test_fix_dbic_protocol(): from collections import namedtuple FakeSeqInfo = namedtuple('FakeSeqInfo', @@ -411,8 +427,9 @@ def test_fix_dbic_protocol(): seqinfos = [seq1, seq2] keys = ['field1'] subsdict = { - 'mystudy': [('scout_run\+', 'scout'), - ('run-life[0-9]', 'run+_task-life')], + md5sum('mystudy'): + [('scout_run\+', 'scout'), + ('run-life[0-9]', 'run+_task-life')], } seqinfos_ = fix_dbic_protocol(seqinfos, keys=keys, subsdict=subsdict) From 3bad1f09422d66e4e57866de39b748f340606901 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Wed, 2 Nov 2016 09:36:16 -0400 Subject: [PATCH 040/181] Forgot to hash in infotodict --- heuristics/dbic_bids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index dad05111..894aefc6 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -80,7 +80,7 @@ def infotodict(seqinfo): """ # XXX: ad hoc hack study_description = get_unique(seqinfo, 'study_description') - if study_description in protocols2fix: + if md5sum(study_description) in protocols2fix: lgr.info("Fixing up protocol for {0}".format(study_description)) seqinfo = fix_dbic_protocol(seqinfo) From eb6f0aa9f1ea5d7823f1d9333f98051de246a29e Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Wed, 2 Nov 2016 12:22:49 -0400 Subject: [PATCH 041/181] Allow split of PI-student with dash as well --- 
heuristics/dbic_bids.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 894aefc6..4c825077 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -259,8 +259,8 @@ def infotoids(seqinfos, outdir): subject = fixup_subjectid(get_unique(seqinfos, 'patient_id')) # TODO: fix up subject id if missing some 0s split = study_description.split('^', 1) - # split first one even more, since couldbe PI_Student - split = split[0].split('_', 1) + split[1:] + # split first one even more, since couldbe PI_Student or PI-Student + split = re.split(split[0], '-|_', 1) + split[1:] # locator = study_description.replace('^', '/') locator = '/'.join(split) From 1a5df6225d695bd9f2d65e2fd0bb455a51497781 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Wed, 2 Nov 2016 12:25:07 -0400 Subject: [PATCH 042/181] BF: pattern comes first --- heuristics/dbic_bids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 4c825077..26f6db6c 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -260,7 +260,7 @@ def infotoids(seqinfos, outdir): # TODO: fix up subject id if missing some 0s split = study_description.split('^', 1) # split first one even more, since couldbe PI_Student or PI-Student - split = re.split(split[0], '-|_', 1) + split[1:] + split = re.split('-|_', split[0], 1) + split[1:] # locator = study_description.replace('^', '/') locator = '/'.join(split) From a41ca4c5994558558f47f7f1aa5bc10c33891f4c Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 09:02:31 -0400 Subject: [PATCH 043/181] Add fix for canceled runs --- heuristics/dbic_bids.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 26f6db6c..d8bcc382 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -27,7 +27,30 @@ def md5sum(string): return m.hexdigest() -# XXX: hackhackhack +def fix_canceled_runs(seqinfo): + """Function that adds cancelme_ to known bad runs which were forgotten""" + # dictionary from accession-number to runs that need to be erased + accession2run = { + 'A000067': ['^09-'] + } + accession_number = get_unique(seqinfo, 'accession_number') + if accession_number in accession2run: + badruns = accession2run[accession_number] + badruns_pattern = '|'.join(badruns) + for i, s in enumerate(seqinfo): + match = re.match(badruns_pattern, s.series_id) + if match: + lgr.info('Fixing bad run {0}'.format(s.series_id)) + fixedkwargs = dict() + for key in keys2replace: + fixedkwargs[key] = 'cancelme_' + getattr(s, key) + seqinfo[i] = s._replace(**fixedkwargs) + return seqinfo + + +# dictionary containing fixes, keys are md5sum of study_description from +# dicoms, in the form of PI^Experimenter; values are list of tuples in the form +# (regex_pattern, substitution) protocols2fix = { '9d148e2a05f782273f6343507733309d': [('anat_', 'anat-'), @@ -41,6 +64,8 @@ def md5sum(string): def fix_dbic_protocol(seqinfo, keys=keys2replace, subsdict=protocols2fix): """Ad-hoc fixup for existing protocols""" + # add cancelme to known bad runs + seqinfo = fix_canceled_runs(seqinfo) # get name of the study to check if we know how to fix it up study_descr = get_unique(seqinfo, 'study_description') From 44503a692f39ae69bde827922b651ebc0e226a47 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 09:33:27 -0400 Subject: [PATCH 044/181] Add 
tests --- heuristics/dbic_bids.py | 61 +++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index d8bcc382..ead089a6 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -27,12 +27,14 @@ def md5sum(string): return m.hexdigest() -def fix_canceled_runs(seqinfo): +# dictionary from accession-number to runs that need to be marked as bad +fix_accession2run = { + 'A000067': ['^09-'] +} + + +def fix_canceled_runs(seqinfo, accession2run=fix_accession2run): """Function that adds cancelme_ to known bad runs which were forgotten""" - # dictionary from accession-number to runs that need to be erased - accession2run = { - 'A000067': ['^09-'] - } accession_number = get_unique(seqinfo, 'accession_number') if accession_number in accession2run: badruns = accession2run[accession_number] @@ -436,15 +438,50 @@ def test_md5sum(): assert md5sum('mysecretmessage') == '07989808231a0c6f522f9d8e34695794' -def test_fix_dbic_protocol(): +def test_fix_canceled_runs(): from collections import namedtuple FakeSeqInfo = namedtuple('FakeSeqInfo', - ['study_description', 'field1', 'field2']) + ['accession_number', 'series_id', + 'protocol_name', 'series_description']) + + seqinfo = [] + runname = 'func_run+' + for i in range(1, 6): + seqinfo.append( + FakeSeqInfo('accession1', + '{0:02d}-'.format(i) + runname, + runname, runname) + ) + + fake_accession2run = { + 'accession1': ['^01-', '^03-'] + } + + seqinfo_ = fix_canceled_runs(seqinfo, fake_accession2run) - seq1 = FakeSeqInfo('mystudy', + for i, s in enumerate(seqinfo_, 1): + output = runname + if i == 1 or i == 3: + output = 'cancelme_' + output + for key in ['series_description', 'protocol_name']: + value = getattr(s, key) + assert(value == output) + # check we didn't touch series_id + assert(s.series_id == '{0:02d}-'.format(i) + runname) + + +def test_fix_dbic_protocol(): + from collections import namedtuple + FakeSeqInfo = namedtuple('FakeSeqInfo', + ['accession_number', 'study_description', + 'field1', 'field2']) + accession_number = 'A003' + seq1 = FakeSeqInfo(accession_number, + 'mystudy', '02-anat-scout_run+_MPR_sag', '11-func_run-life2_acq-2mm692') - seq2 = FakeSeqInfo('mystudy', + seq2 = FakeSeqInfo(accession_number, + 'mystudy', 'nochangeplease', 'nochangeeither') @@ -460,7 +497,8 @@ def test_fix_dbic_protocol(): seqinfos_ = fix_dbic_protocol(seqinfos, keys=keys, subsdict=subsdict) assert(seqinfos[1] == seqinfos_[1]) # field2 shouldn't have changed since I didn't pass it - assert(seqinfos_[0] == FakeSeqInfo('mystudy', + assert(seqinfos_[0] == FakeSeqInfo(accession_number, + 'mystudy', '02-anat-scout_MPR_sag', seq1.field2)) @@ -469,7 +507,8 @@ def test_fix_dbic_protocol(): seqinfos_ = fix_dbic_protocol(seqinfos, keys=keys, subsdict=subsdict) assert(seqinfos[1] == seqinfos_[1]) # now everything should have changed - assert(seqinfos_[0] == FakeSeqInfo('mystudy', + assert(seqinfos_[0] == FakeSeqInfo(accession_number, + 'mystudy', '02-anat-scout_MPR_sag', '11-func_run+_task-life_acq-2mm692')) From 8079d747b42eb98803b9dc5428eec9d4732f0b4f Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 09:40:36 -0400 Subject: [PATCH 045/181] Allow zero at the beginning or not --- heuristics/dbic_bids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index ead089a6..9e569408 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -29,7 +29,7 @@ def 
md5sum(string): # dictionary from accession-number to runs that need to be marked as bad fix_accession2run = { - 'A000067': ['^09-'] + 'A000067': ['^.9-'] } From 2ac967acd3ec2077fb127e29280f024f1a3adaef Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 09:42:52 -0400 Subject: [PATCH 046/181] BF: regex could match too many different runs --- heuristics/dbic_bids.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 9e569408..71f67348 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -28,8 +28,10 @@ def md5sum(string): # dictionary from accession-number to runs that need to be marked as bad +# NOTE: even if filename has number that is 0-padded, internally no padding +# is done fix_accession2run = { - 'A000067': ['^.9-'] + 'A000067': ['^9-'] } From f3dd773904f03ec937ae9f5edc25943a50205c93 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 09:49:19 -0400 Subject: [PATCH 047/181] Fix comment --- heuristics/dbic_bids.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 71f67348..10e1ad26 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -53,8 +53,8 @@ def fix_canceled_runs(seqinfo, accession2run=fix_accession2run): # dictionary containing fixes, keys are md5sum of study_description from -# dicoms, in the form of PI^Experimenter; values are list of tuples in the form -# (regex_pattern, substitution) +# dicoms, in the form of PI-Experimenter^protocolname +# values are list of tuples in the form (regex_pattern, substitution) protocols2fix = { '9d148e2a05f782273f6343507733309d': [('anat_', 'anat-'), From ff1d2e6d91284c0d634da9d57c140ac40e070a64 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 13:55:39 -0400 Subject: [PATCH 048/181] Add ad-hoc substitutions for old protocol names, discard files that do not start with number --- heuristics/dbic_bids.py | 61 ++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 10e1ad26..9d13ac5a 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -6,6 +6,45 @@ import logging lgr = logging.getLogger('heudiconv') +# dictionary from accession-number to runs that need to be marked as bad +# NOTE: even if filename has number that is 0-padded, internally no padding +# is done +fix_accession2run = { + 'A000067': ['^9-'], + # duplicate files -- use only those that start with numbers + 'A000006': ['^[A-Za-z]'] +} + +# dictionary containing fixes, keys are md5sum of study_description from +# dicoms, in the form of PI-Experimenter^protocolname +# values are list of tuples in the form (regex_pattern, substitution) +protocols2fix = { + '9d148e2a05f782273f6343507733309d': + [('anat_', 'anat-'), + ('run-life[0-9]', 'run+_task-life'), + ('scout_run\+', 'scout'), + # substitutions for old protocol names + ('AAHead_Scout_32ch-head-coil', 'anat-scout'), + ('MPRAGE', 'anat-T1w_acq-MPRAGE_run+'), + ('gre_field_mapping_2mm', 'fmap_acq-2mm'), + ('epi_bold_sms_p2_s4_2mm_life1_748', + 'func_run+_task-life_acq-2mm748'), + ('epi_bold_sms_p2_s4_2mm_life2_692', + 'func_run+_task-life_acq-2mm692'), + ('epi_bold_sms_p2_s4_2mm_life3_754', + 'func_run+_task-life_acq-2mm754'), + ('epi_bold_sms_p2_s4_2mm_life4_824', + 'func_run+_task-life_acq-2mm824'), + ('t2_space_sag_p4_iso', 'anat-T2w'), + ('gre_field_mapping_2.4mm', 
'fmap_acq-2.4mm'), + ('rest_p2_sms4_2.4mm_64sl_1000tr_32te_600dyn', + 'func_run+_task-rest_acq-2.4mm64sl1000tr32te600dyn'), + ('DTI_30', 'dwi_run+_acq-30')], + '76b36c80231b0afaf509e2d52046e964': + [('fmap_run\+_2mm', 'fmap_run+_acq-2mm')] +} +keys2replace = ['protocol_name', 'series_description'] + def create_key(subdir, file_suffix, outtype=('nii.gz', 'dicom'), annotation_classes=None, prefix=''): @@ -27,14 +66,6 @@ def md5sum(string): return m.hexdigest() -# dictionary from accession-number to runs that need to be marked as bad -# NOTE: even if filename has number that is 0-padded, internally no padding -# is done -fix_accession2run = { - 'A000067': ['^9-'] -} - - def fix_canceled_runs(seqinfo, accession2run=fix_accession2run): """Function that adds cancelme_ to known bad runs which were forgotten""" accession_number = get_unique(seqinfo, 'accession_number') @@ -52,20 +83,6 @@ def fix_canceled_runs(seqinfo, accession2run=fix_accession2run): return seqinfo -# dictionary containing fixes, keys are md5sum of study_description from -# dicoms, in the form of PI-Experimenter^protocolname -# values are list of tuples in the form (regex_pattern, substitution) -protocols2fix = { - '9d148e2a05f782273f6343507733309d': - [('anat_', 'anat-'), - ('run-life[0-9]', 'run+_task-life'), - ('scout_run\+', 'scout')], - '76b36c80231b0afaf509e2d52046e964': - [('fmap_run\+_2mm', 'fmap_run+_acq-2mm')] -} -keys2replace = ['protocol_name', 'series_description'] - - def fix_dbic_protocol(seqinfo, keys=keys2replace, subsdict=protocols2fix): """Ad-hoc fixup for existing protocols""" # add cancelme to known bad runs From 912e036daa3eca952274996b149acb0c416e7be7 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 14:55:57 -0400 Subject: [PATCH 049/181] NF: add filtering based on filename --- bin/heudiconv | 12 +++++++++++- heuristics/dbic_bids.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/bin/heudiconv b/bin/heudiconv index 95b46224..869898bb 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -212,7 +212,7 @@ def find_files(regex, topdir=curdir, exclude=None, exclude_vcs=True, dirs=False) find_files.__doc__ %= (_VCS_REGEX,) -def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): +def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=False): """Process list of dicoms and return seqinfo and file group `seqinfo` contains per-sequence extract of fields from DICOMs which @@ -222,6 +222,9 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): ---------- fl : list of str List of files to consider + flfilter : callable, optional + Applied to each of fl. Should return True if file needs to be kept, + False otherwise. Used to filter fl dcmfilter : callable, optional If called on dcm_data and returns True, it is used to set series_id @@ -248,6 +251,12 @@ def group_dicoms_into_seqinfos(fl, dcmfilter=None, per_studyUID=False): # "study". If not -- what is the use-case? (interrupted acquisition?) 
# and how would then we deal with series numbers # which would differ already + if flfilter: + nfl_before = len(fl) + fl = filter(flfilter, fl) + nfl_after = len(fl) + lgr.info('Filtering out {0} dicoms based on their filename'.format( + nfl_before-nfl_after)) for fidx, filename in enumerate(fl): # TODO after getting a regression test check if the same behavior # with stop_before_pixels=True @@ -912,6 +921,7 @@ def convert_dicoms(sid, if dicoms: seqinfo = group_dicoms_into_seqinfos( dicoms, + flfilter=getattr(heuristic, 'filter_files', None), dcmfilter=getattr(heuristic, 'filter_dicom', None)) seqinfo_list = list(seqinfo.keys()) filegroup = {si.series_id: x for si, x in seqinfo.items()} diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 9d13ac5a..1c211784 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -46,6 +46,16 @@ keys2replace = ['protocol_name', 'series_description'] +def filter_files(fn): + """Return True if a file should be kept, else False. + We're using it to filter out files that do not start with a number.""" + + split = fn.split('/') + sequence_dir = split[-2] + + return True if re.match('^[0-9]+-', sequence_dir) else False + + def create_key(subdir, file_suffix, outtype=('nii.gz', 'dicom'), annotation_classes=None, prefix=''): if not subdir: From d7f96ed7011b1bfd8954cb25ad1013bd29932fd5 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 15:01:47 -0400 Subject: [PATCH 050/181] Add filtering also under get_study_sessions --- bin/heudiconv | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/heudiconv b/bin/heudiconv index 869898bb..76b09aad 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1110,6 +1110,7 @@ def get_study_sessions(dicom_dir_template, files_opt, heuristic, outdir, #import pdb; pdb.set_trace() seqinfo_dict = group_dicoms_into_seqinfos( files_, + flfilter=getattr(heuristic, 'filter_files', None), dcmfilter=getattr(heuristic, 'filter_dicom', None), per_studyUID=True) From 56530b54af1b6f56c10c929ddd48874dca324b09 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 15:23:54 -0400 Subject: [PATCH 051/181] Modify replacement --- heuristics/dbic_bids.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 1c211784..5d12df91 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -26,7 +26,7 @@ # substitutions for old protocol names ('AAHead_Scout_32ch-head-coil', 'anat-scout'), ('MPRAGE', 'anat-T1w_acq-MPRAGE_run+'), - ('gre_field_mapping_2mm', 'fmap_acq-2mm'), + ('gre_field_mapping_2mm', 'fmap_run+_acq-2mm'), ('epi_bold_sms_p2_s4_2mm_life1_748', 'func_run+_task-life_acq-2mm748'), ('epi_bold_sms_p2_s4_2mm_life2_692', @@ -36,7 +36,7 @@ ('epi_bold_sms_p2_s4_2mm_life4_824', 'func_run+_task-life_acq-2mm824'), ('t2_space_sag_p4_iso', 'anat-T2w'), - ('gre_field_mapping_2.4mm', 'fmap_acq-2.4mm'), + ('gre_field_mapping_2.4mm', 'fmap_run+_acq-2.4mm'), ('rest_p2_sms4_2.4mm_64sl_1000tr_32te_600dyn', 'func_run+_task-rest_acq-2.4mm64sl1000tr32te600dyn'), ('DTI_30', 'dwi_run+_acq-30')], From a2fe68597e3494a2555b22d39022e85f5695417f Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 15:30:39 -0400 Subject: [PATCH 052/181] Do not complain if magnitude is not present for one of the first studies --- heuristics/dbic_bids.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 5d12df91..392d64eb 100644 --- 
a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -224,7 +224,11 @@ def infotodict(seqinfo): # sequences -- e.g. one for magnitude(s) and other ones for # phases. In those we must not increment run! if image_data_type == 'P': - if prev_image_data_type != 'M': + # XXX: check that study description is not one of the first one; + # this is needed because at the beginning only + # phasediff was acquired + if prev_image_data_type != 'M' and \ + md5sum(s.study_description) != '9d148e2a05f782273f6343507733309d': raise RuntimeError("Was expecting phase image to follow magnitude image, but previous one was %r", prev_image_data_type) # else we do nothing special else: # and otherwise we go to the next run From 0314c5785def5f0f55f491086690c59cb27bb77e Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 15:43:11 -0400 Subject: [PATCH 053/181] Rename test directory because it should start with a number --- ....1107.5.2.43.66112.2016101409263663466202201.dcm | Bin 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/data/{fmap_acq-3mm => 01-fmap_acq-3mm}/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm (100%) diff --git a/tests/data/fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm b/tests/data/01-fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm similarity index 100% rename from tests/data/fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm rename to tests/data/01-fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm From 8cc9814ae98ba174a21655db4287514e8211140f Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 15:53:22 -0400 Subject: [PATCH 054/181] Fix directory for test --- tests/test_tarballs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tarballs.py b/tests/test_tarballs.py index 1b8c8de8..3a9a9edf 100644 --- a/tests/test_tarballs.py +++ b/tests/test_tarballs.py @@ -18,7 +18,7 @@ def test_reproducibility(tmpdir): #heudiconv.compress_dicoms(dicom_list, prefix, sourcedir) prefix = str(tmpdir.join("precious")) - args = [glob(opj(tests_datadir, 'fmap_acq-3mm', '*')), prefix] + args = [glob(opj(tests_datadir, '01-fmap_acq-3mm', '*')), prefix] tarball = heudiconv.compress_dicoms(*args) md5 = md5sum(tarball) assert tarball From 994610f17548970c5e5e00d6138bde58e2f498a8 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 17:05:38 -0400 Subject: [PATCH 055/181] Fix on phasediff for one of the first studies --- heuristics/dbic_bids.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 392d64eb..968a15b1 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -224,11 +224,12 @@ def infotodict(seqinfo): # sequences -- e.g. one for magnitude(s) and other ones for # phases. In those we must not increment run! 
if image_data_type == 'P': - # XXX: check that study description is not one of the first one; - # this is needed because at the beginning only - # phasediff was acquired - if prev_image_data_type != 'M' and \ - md5sum(s.study_description) != '9d148e2a05f782273f6343507733309d': + if prev_image_data_type != 'M': + # XXX: check that study description is not one of the first one; + # this is needed because at the beginning only phasediff was acquired + # and we want to save both anyway + if md5sum(s.study_description) == '9d148e2a05f782273f6343507733309d': + current_run += 1 raise RuntimeError("Was expecting phase image to follow magnitude image, but previous one was %r", prev_image_data_type) # else we do nothing special else: # and otherwise we go to the next run From 913a55c4cbb4487b699bb834dac2a98a96f2e705 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 17:06:00 -0400 Subject: [PATCH 056/181] Fix on phasediff for one of the first studies --- heuristics/dbic_bids.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 968a15b1..8fe2ae25 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -230,7 +230,8 @@ def infotodict(seqinfo): # and we want to save both anyway if md5sum(s.study_description) == '9d148e2a05f782273f6343507733309d': current_run += 1 - raise RuntimeError("Was expecting phase image to follow magnitude image, but previous one was %r", prev_image_data_type) + else: + raise RuntimeError("Was expecting phase image to follow magnitude image, but previous one was %r", prev_image_data_type) # else we do nothing special else: # and otherwise we go to the next run current_run += 1 From e3d973ecc1e8c478aab9b61eae87ce16ef727096 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 19:58:30 -0400 Subject: [PATCH 057/181] Fix logic for phasediff in some study: always increase runnr --- heuristics/dbic_bids.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 8fe2ae25..5d1eccfb 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -220,21 +220,24 @@ def infotodict(seqinfo): if run is not None: # so we have an indicator for a run if run == '+': - # some sequences, e.g. fmap, would generate two (or more?) - # sequences -- e.g. one for magnitude(s) and other ones for - # phases. In those we must not increment run! - if image_data_type == 'P': - if prev_image_data_type != 'M': - # XXX: check that study description is not one of the first one; - # this is needed because at the beginning only phasediff was acquired - # and we want to save both anyway - if md5sum(s.study_description) == '9d148e2a05f782273f6343507733309d': - current_run += 1 - else: - raise RuntimeError("Was expecting phase image to follow magnitude image, but previous one was %r", prev_image_data_type) - # else we do nothing special - else: # and otherwise we go to the next run - current_run += 1 + # XXX if we have a known earlier study, we need to always + # increase the run counter for phasediff because magnitudes + # were not acquired + if md5sum(s.study_description) == '9d148e2a05f782273f6343507733309d': + if image_data_type == 'P' and prev_image_data_type != 'M': + current_run += 1 + else: + # some sequences, e.g. fmap, would generate two (or more?) + # sequences -- e.g. one for magnitude(s) and other ones for + # phases. In those we must not increment run! 
+ if image_data_type == 'P': + if prev_image_data_type != 'M': + raise RuntimeError( + "Was expecting phase image to follow magnitude " + "image, but previous one was %r", prev_image_data_type) + # else we do nothing special + else: # and otherwise we go to the next run + current_run += 1 elif run == '=': if not current_run: current_run = 1 From 3e94ff582ad96181f76343c2eda6122f45ee4d90 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 19:59:00 -0400 Subject: [PATCH 058/181] One line less --- heuristics/dbic_bids.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 5d1eccfb..f45bf14a 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -83,8 +83,7 @@ def fix_canceled_runs(seqinfo, accession2run=fix_accession2run): badruns = accession2run[accession_number] badruns_pattern = '|'.join(badruns) for i, s in enumerate(seqinfo): - match = re.match(badruns_pattern, s.series_id) - if match: + if re.match(badruns_pattern, s.series_id): lgr.info('Fixing bad run {0}'.format(s.series_id)) fixedkwargs = dict() for key in keys2replace: From 660f286950257cc34c6843a7a904b976966035d8 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 20:03:59 -0400 Subject: [PATCH 059/181] Use os.path.split to split fn --- heuristics/dbic_bids.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index f45bf14a..fd5bd5e8 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -50,8 +50,9 @@ def filter_files(fn): """Return True if a file should be kept, else False. We're using it to filter out files that do not start with a number.""" - split = fn.split('/') - sequence_dir = split[-2] + split = os.path.split(fn) + split2 = os.path.split(split[0]) + sequence_dir = split2[1] return True if re.match('^[0-9]+-', sequence_dir) else False @@ -470,6 +471,11 @@ def fixup_subjectid(subjectid): return "sid%06d" % int(reg.groups()[0]) +def test_filter_files(): + assert(filter_files('/home/mvdoc/dbic/09-run_func_meh/0123432432.dcm')) + assert(not filter_files('/home/mvdoc/dbic/run_func_meh/012343143.dcm')) + + def test_md5sum(): assert md5sum('cryptonomicon') == '1cd52edfa41af887e14ae71d1db96ad1' assert md5sum('mysecretmessage') == '07989808231a0c6f522f9d8e34695794' From cd16ede72380946924f4e5ef93ee590ac3b973b3 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 20:17:57 -0400 Subject: [PATCH 060/181] Maybe this time I got the logic right for phasediff --- heuristics/dbic_bids.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index fd5bd5e8..4e6082bf 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -220,24 +220,23 @@ def infotodict(seqinfo): if run is not None: # so we have an indicator for a run if run == '+': - # XXX if we have a known earlier study, we need to always - # increase the run counter for phasediff because magnitudes - # were not acquired - if md5sum(s.study_description) == '9d148e2a05f782273f6343507733309d': - if image_data_type == 'P' and prev_image_data_type != 'M': + # some sequences, e.g. fmap, would generate two (or more?) + # sequences -- e.g. one for magnitude(s) and other ones for + # phases. In those we must not increment run! 
+ if image_data_type == 'P': + # XXX if we have a known earlier study, we need to always + # increase the run counter for phasediff because magnitudes + # were not acquired + if md5sum(s.study_description) == '9d148e2a05f782273f6343507733309d': current_run += 1 - else: - # some sequences, e.g. fmap, would generate two (or more?) - # sequences -- e.g. one for magnitude(s) and other ones for - # phases. In those we must not increment run! - if image_data_type == 'P': + else: if prev_image_data_type != 'M': raise RuntimeError( "Was expecting phase image to follow magnitude " "image, but previous one was %r", prev_image_data_type) # else we do nothing special - else: # and otherwise we go to the next run - current_run += 1 + else: # and otherwise we go to the next run + current_run += 1 elif run == '=': if not current_run: current_run = 1 From 1a8c1e7828fe8daa65579f682fa3f75eab5d6f29 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 3 Nov 2016 20:23:57 -0400 Subject: [PATCH 061/181] Some minor fixups --- heuristics/dbic_bids.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 4e6082bf..27917fe6 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -11,8 +11,6 @@ # is done fix_accession2run = { 'A000067': ['^9-'], - # duplicate files -- use only those that start with numbers - 'A000006': ['^[A-Za-z]'] } # dictionary containing fixes, keys are md5sum of study_description from @@ -23,6 +21,7 @@ [('anat_', 'anat-'), ('run-life[0-9]', 'run+_task-life'), ('scout_run\+', 'scout'), + ('T2w', 'T2w_run+'), # substitutions for old protocol names ('AAHead_Scout_32ch-head-coil', 'anat-scout'), ('MPRAGE', 'anat-T1w_acq-MPRAGE_run+'), @@ -35,7 +34,7 @@ 'func_run+_task-life_acq-2mm754'), ('epi_bold_sms_p2_s4_2mm_life4_824', 'func_run+_task-life_acq-2mm824'), - ('t2_space_sag_p4_iso', 'anat-T2w'), + ('t2_space_sag_p4_iso', 'anat-T2w_run+'), ('gre_field_mapping_2.4mm', 'fmap_run+_acq-2.4mm'), ('rest_p2_sms4_2.4mm_64sl_1000tr_32te_600dyn', 'func_run+_task-rest_acq-2.4mm64sl1000tr32te600dyn'), From a511081878db1c05b21a0094a6d915a3d4da0d29 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Fri, 4 Nov 2016 12:26:30 -0400 Subject: [PATCH 062/181] Add some more cancel runs --- heuristics/dbic_bids.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 27917fe6..5b82e417 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -11,6 +11,11 @@ # is done fix_accession2run = { 'A000067': ['^9-'], + 'A000072': ['^5-'], + 'A000081': ['^5-'], + 'A000082': ['^5-'], + 'A000088': ['^9-'], + 'A000090': ['^5-'], } # dictionary containing fixes, keys are md5sum of study_description from From 8b79ebac1e039cb14fa494c5c91085f501253d44 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Fri, 4 Nov 2016 12:31:06 -0400 Subject: [PATCH 063/181] Add filtering of dicom for one particular studyinstanceid --- heuristics/dbic_bids.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 5b82e417..fc706907 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -49,6 +49,16 @@ } keys2replace = ['protocol_name', 'series_description'] +# list containing StudyInstanceID to skip -- hopefully doesn't happen too often +dicoms2skip = [ + '1.3.12.2.1107.5.2.43.66112.30000016110117002435700000001' +] + + +def filter_dicom(dcmdata): + """Return True if a DICOM dataset should be filtered 
out, else False""" + return True if dcmdata.StudyInstanceUID in dicoms2skip else False + def filter_files(fn): """Return True if a file should be kept, else False. From 40347c68c8cfd485c10c9676edf29b8bc7b0517b Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Fri, 4 Nov 2016 18:06:19 -0400 Subject: [PATCH 064/181] Add more ad hoc fixes for known protocols --- heuristics/dbic_bids.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index fc706907..9d23f07e 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -10,6 +10,8 @@ # NOTE: even if filename has number that is 0-padded, internally no padding # is done fix_accession2run = { + 'A000005': ['^1-'], + 'A000035': ['^8-', '^9-'], 'A000067': ['^9-'], 'A000072': ['^5-'], 'A000081': ['^5-'], @@ -31,6 +33,7 @@ ('AAHead_Scout_32ch-head-coil', 'anat-scout'), ('MPRAGE', 'anat-T1w_acq-MPRAGE_run+'), ('gre_field_mapping_2mm', 'fmap_run+_acq-2mm'), + ('gre_field_mapping_3mm', 'fmap_run+_acq-3mm'), ('epi_bold_sms_p2_s4_2mm_life1_748', 'func_run+_task-life_acq-2mm748'), ('epi_bold_sms_p2_s4_2mm_life2_692', @@ -39,13 +42,23 @@ 'func_run+_task-life_acq-2mm754'), ('epi_bold_sms_p2_s4_2mm_life4_824', 'func_run+_task-life_acq-2mm824'), + ('epi_bold_p2_3mm_nofs_life1_374', + 'func_run+_task-life_acq-3mmnofs374'), + ('epi_bold_p2_3mm_nofs_life2_346', + 'func_run+_task-life_acq-3mmnofs346'), + ('epi_bold_p2_3mm_nofs_life3_377', + 'func_run+_task-life_acq-3mmnofs377'), + ('epi_bold_p2_3mm_nofs_life4_412', + 'func_run+_task-life_acq-3mmnofs412'), ('t2_space_sag_p4_iso', 'anat-T2w_run+'), ('gre_field_mapping_2.4mm', 'fmap_run+_acq-2.4mm'), ('rest_p2_sms4_2.4mm_64sl_1000tr_32te_600dyn', 'func_run+_task-rest_acq-2.4mm64sl1000tr32te600dyn'), ('DTI_30', 'dwi_run+_acq-30')], '76b36c80231b0afaf509e2d52046e964': - [('fmap_run\+_2mm', 'fmap_run+_acq-2mm')] + [('fmap_run\+_2mm', 'fmap_run+_acq-2mm')], + 'c6d8fbccc72990bee61d28e73b2618a4': + [('run=', 'run+')] } keys2replace = ['protocol_name', 'series_description'] @@ -64,11 +77,23 @@ def filter_files(fn): """Return True if a file should be kept, else False. 
We're using it to filter out files that do not start with a number.""" + # do not check for these accession numbers because they haven't been + # recopied with the initial number + donotfilter = ['A000012', 'A000013', 'A000041'] + split = os.path.split(fn) split2 = os.path.split(split[0]) sequence_dir = split2[1] - - return True if re.match('^[0-9]+-', sequence_dir) else False + accession_number = os.path.split(split2[0]) + if accession_number == 'A000043': + # crazy one that got copied for some runs but not for others, + # so we are going to discard those that got copied and let heudiconv + # figure out the rest + return False if re.match('^[0-9]+-', sequence_dir) else True + elif accession_number in donotfilter: + return True + else: + return True if re.match('^[0-9]+-', sequence_dir) else False def create_key(subdir, file_suffix, outtype=('nii.gz', 'dicom'), From 9c321d82a9c39119ea8b4bef72b470d1572c4b01 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Fri, 4 Nov 2016 18:12:03 -0400 Subject: [PATCH 065/181] Fix accession number in filter_files --- heuristics/dbic_bids.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 9d23f07e..f9849a87 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -84,7 +84,8 @@ def filter_files(fn): split = os.path.split(fn) split2 = os.path.split(split[0]) sequence_dir = split2[1] - accession_number = os.path.split(split2[0]) + split3 = os.path.split(split2[0]) + accession_number = split3[0] if accession_number == 'A000043': # crazy one that got copied for some runs but not for others, # so we are going to discard those that got copied and let heudiconv From cd99edea594bddae5cd2ce3ef223bf7c018c1cf5 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Fri, 4 Nov 2016 18:13:31 -0400 Subject: [PATCH 066/181] Really fix that --- heuristics/dbic_bids.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index f9849a87..1cb14df5 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -85,7 +85,7 @@ def filter_files(fn): split2 = os.path.split(split[0]) sequence_dir = split2[1] split3 = os.path.split(split2[0]) - accession_number = split3[0] + accession_number = split3[1] if accession_number == 'A000043': # crazy one that got copied for some runs but not for others, # so we are going to discard those that got copied and let heudiconv @@ -650,4 +650,4 @@ def test_parse_dbic_protocol_name(): 'seqtype': 'anat', 'seqtype_label': 'scout', 'session': '+', - } \ No newline at end of file + } From 53c2fecd7cc8e3835387c58d7a4ff7753458e20d Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Fri, 4 Nov 2016 21:12:10 -0400 Subject: [PATCH 067/181] Another fix for logic of phasediff --- heuristics/dbic_bids.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 1cb14df5..8f0c18be 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -264,13 +264,13 @@ def infotodict(seqinfo): # sequences -- e.g. one for magnitude(s) and other ones for # phases. In those we must not increment run! 
if image_data_type == 'P': - # XXX if we have a known earlier study, we need to always - # increase the run counter for phasediff because magnitudes - # were not acquired - if md5sum(s.study_description) == '9d148e2a05f782273f6343507733309d': - current_run += 1 - else: - if prev_image_data_type != 'M': + if prev_image_data_type != 'M': + # XXX if we have a known earlier study, we need to always + # increase the run counter for phasediff because magnitudes + # were not acquired + if md5sum(s.study_description) == '9d148e2a05f782273f6343507733309d': + current_run += 1 + else: raise RuntimeError( "Was expecting phase image to follow magnitude " "image, but previous one was %r", prev_image_data_type) From 1272721c0872fe2b8e3f00666766fbf790497092 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sat, 5 Nov 2016 13:22:30 -0400 Subject: [PATCH 068/181] Add one more StudyInstanceUID to skip --- heuristics/dbic_bids.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 8f0c18be..e4692e30 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -62,9 +62,10 @@ } keys2replace = ['protocol_name', 'series_description'] -# list containing StudyInstanceID to skip -- hopefully doesn't happen too often +# list containing StudyInstanceUID to skip -- hopefully doesn't happen too often dicoms2skip = [ - '1.3.12.2.1107.5.2.43.66112.30000016110117002435700000001' + '1.3.12.2.1107.5.2.43.66112.30000016110117002435700000001', + '1.3.12.2.1107.5.2.43.66112.30000016102813152550600000004', # double scout ] From a994d7f6da68293d4c7b5b1a2becea2dc0f3d222 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 6 Nov 2016 12:06:34 -0500 Subject: [PATCH 069/181] Add seqtype_label for dwi sequences as well --- heuristics/dbic_bids.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index e4692e30..655e84b0 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -257,6 +257,10 @@ def infotodict(seqinfo): 'P': 'phasediff' }[image_data_type] + # label for dwi as well + if seqtype == 'dwi' and not seqtype_label: + seqtype_label = 'dwi' + run = regd.get('run') if run is not None: # so we have an indicator for a run From 2466cefac84dc5d2a11efcba7e3eacf415929257 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 6 Nov 2016 15:10:22 -0500 Subject: [PATCH 070/181] Filter out files for one study that had no study description --- heuristics/dbic_bids.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 655e84b0..849db98a 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -92,6 +92,10 @@ def filter_files(fn): # so we are going to discard those that got copied and let heudiconv # figure out the rest return False if re.match('^[0-9]+-', sequence_dir) else True + elif accession_number == 'unknown': + # this one had some stuff without study description, filter stuff before + # collecting info, so it doesn't crash completely + return False if re.match('^[34][07-9]-sn', sequence_dir) else True elif accession_number in donotfilter: return True else: From 766d5feae5cc3dae5165de58dcd10a26129302e5 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 6 Nov 2016 15:18:55 -0500 Subject: [PATCH 071/181] Avoid increasing run number if sequence is derived --- heuristics/dbic_bids.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py 
b/heuristics/dbic_bids.py index 849db98a..08dc9819 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -285,7 +285,10 @@ def infotodict(seqinfo): "image, but previous one was %r", prev_image_data_type) # else we do nothing special else: # and otherwise we go to the next run - current_run += 1 + # increase run number only if it's not derived, + # so we avoid having a ludicrous number of runs + if not s.is_derived: + current_run += 1 elif run == '=': if not current_run: current_run = 1 From f4e8e85863cfa9c348b06b2ddde8a046c3f7619a Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 6 Nov 2016 15:23:57 -0500 Subject: [PATCH 072/181] Skip derived sequences completely --- heuristics/dbic_bids.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 08dc9819..c02235a7 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -194,6 +194,10 @@ def infotodict(seqinfo): run_label = None # run- image_data_type = None for s in seqinfo: + # XXX: skip derived sequences, we don't store them to avoid polluting + # the directory + if s.is_derived: + continue template = None suffix = '' seq = [] @@ -235,17 +239,18 @@ def infotodict(seqinfo): "Deduced seqtype to be %s from DICOM, but got %s out of %s", image_type_seqtype, seqtype, protocol_name_tuned) - if s.is_derived: - # Let's for now stash those close to original images - # TODO: we might want a separate tree for all of this!? - # so more of a parameter to the create_key - #seqtype += '/derivative' - # just keep it lower case and without special characters - # XXXX what for??? - #seq.append(s.series_description.lower()) - prefix = os.path.join('derivatives', 'scanner') - else: - prefix = '' + # if s.is_derived: + # # Let's for now stash those close to original images + # # TODO: we might want a separate tree for all of this!? + # # so more of a parameter to the create_key + # #seqtype += '/derivative' + # # just keep it lower case and without special characters + # # XXXX what for??? 
+ # #seq.append(s.series_description.lower()) + # prefix = os.path.join('derivatives', 'scanner') + # else: + # prefix = '' + prefix = '' # analyze s.protocol_name (series_id is based on it) for full name mapping etc if seqtype == 'func' and not seqtype_label: @@ -285,10 +290,7 @@ def infotodict(seqinfo): "image, but previous one was %r", prev_image_data_type) # else we do nothing special else: # and otherwise we go to the next run - # increase run number only if it's not derived, - # so we avoid having a ludicrous number of runs - if not s.is_derived: - current_run += 1 + current_run += 1 elif run == '=': if not current_run: current_run = 1 From f9d376bb9eaa0e288f46324ac8412d77abab84f6 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 6 Nov 2016 15:26:05 -0500 Subject: [PATCH 073/181] Be more verbose about skipping derived --- heuristics/dbic_bids.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index c02235a7..29d095af 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -197,6 +197,8 @@ def infotodict(seqinfo): # XXX: skip derived sequences, we don't store them to avoid polluting # the directory if s.is_derived: + skipped.append(s.series_id) + lgr.debug("Ignoring derived data %s", s.series_id) continue template = None suffix = '' From 591f90168d069f86faa7ffda51b52dd227f222cc Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 6 Nov 2016 17:18:59 -0500 Subject: [PATCH 074/181] BF, XXX: add ad-hoc fix in heudiconv for one study that had two StudyInstanceUID. Remove in the future --- bin/heudiconv | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/bin/heudiconv b/bin/heudiconv index 76b09aad..394cbfa1 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -421,6 +421,19 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=F else: seqinfo[info] = files + # XXX: this is a one-time fix for a study which got two StudyID despite + # having everything else similar. As such this should be removed in the + # future. May our future selves be kind on us. Matteo doesn't like this + # fix but it's quick and very dirty. 
+ _fixstudyuid1 = '1.3.12.2.1107.5.2.43.66112.30000016101414585478800001750' + _fixstudyuid2 = '1.3.12.2.1107.5.2.43.66112.30000016101414585478800000236' + if _fixstudyuid1 in seqinfo and _fixstudyuid2 in seqinfo and len(seqinfo) == 2: + # now we need to put everything in one single dictionary + for key, value in seqinfo[_fixstudyuid2].iteritems(): + seqinfo[_fixstudyuid1][key] = value + del seqinfo[_fixstudyuid2] + lgr.warn("FIXED UP StudyInstanceUID") + if per_studyUID: lgr.info("Generated sequence info for %d studies with %d entries total", len(seqinfo), sum(map(len, seqinfo.values()))) From 4677849a0fb9a26592f95f11d3679764f57d5205 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 6 Nov 2016 17:32:30 -0500 Subject: [PATCH 075/181] Rename one more sequence --- heuristics/dbic_bids.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 29d095af..4b4b36d5 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -54,7 +54,8 @@ ('gre_field_mapping_2.4mm', 'fmap_run+_acq-2.4mm'), ('rest_p2_sms4_2.4mm_64sl_1000tr_32te_600dyn', 'func_run+_task-rest_acq-2.4mm64sl1000tr32te600dyn'), - ('DTI_30', 'dwi_run+_acq-30')], + ('DTI_30', 'dwi_run+_acq-30'), + ('t1_space_sag_p2_iso', 'anat-T1w_acq-060mm_run+')], '76b36c80231b0afaf509e2d52046e964': [('fmap_run\+_2mm', 'fmap_run+_acq-2mm')], 'c6d8fbccc72990bee61d28e73b2618a4': From 2f5d328987ef9e4f151a94803c810bb6a534a41d Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Mon, 7 Nov 2016 09:02:41 -0500 Subject: [PATCH 076/181] Avoid fixing up in heudiconv.py, we'll do it manually later --- bin/heudiconv | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 394cbfa1..76b09aad 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -421,19 +421,6 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=F else: seqinfo[info] = files - # XXX: this is a one-time fix for a study which got two StudyID despite - # having everything else similar. As such this should be removed in the - # future. May our future selves be kind on us. Matteo doesn't like this - # fix but it's quick and very dirty. - _fixstudyuid1 = '1.3.12.2.1107.5.2.43.66112.30000016101414585478800001750' - _fixstudyuid2 = '1.3.12.2.1107.5.2.43.66112.30000016101414585478800000236' - if _fixstudyuid1 in seqinfo and _fixstudyuid2 in seqinfo and len(seqinfo) == 2: - # now we need to put everything in one single dictionary - for key, value in seqinfo[_fixstudyuid2].iteritems(): - seqinfo[_fixstudyuid1][key] = value - del seqinfo[_fixstudyuid2] - lgr.warn("FIXED UP StudyInstanceUID") - if per_studyUID: lgr.info("Generated sequence info for %d studies with %d entries total", len(seqinfo), sum(map(len, seqinfo.values()))) From 48fa3fc0d6fb48c5d6058df36d042ef152610565 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 8 Nov 2016 14:55:40 -0500 Subject: [PATCH 077/181] ENH/BF: allow for accessions starting with phantom-, sanitize subjectid to not have _ or - --- heuristics/dbic_bids.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 4b4b36d5..7e9c4ee2 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -99,6 +99,9 @@ def filter_files(fn): return False if re.match('^[34][07-9]-sn', sequence_dir) else True elif accession_number in donotfilter: return True + elif accession_number.startswith('phantom-'): + # Accessions on phantoms, e.g. 
in dartmouth-phantoms/bids_test4-20161014 + return True else: return True if re.match('^[0-9]+-', sequence_dir) else False @@ -523,7 +526,8 @@ def fixup_subjectid(subjectid): reg = re.match("sid0*(\d+)$", subjectid) if not reg: # some completely other pattern - return subjectid + # just filter out possible _- in it + return re.sub('[-_]', '', subjectid) return "sid%06d" % int(reg.groups()[0]) From 64989469b2b042d6237c38fd68d676aa6fb901c0 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 8 Nov 2016 16:52:15 -0500 Subject: [PATCH 078/181] ENH: remove _ and - during sanitization, do that to angio suffixes --- heuristics/dbic_bids.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 7e9c4ee2..fbf4c867 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -231,7 +231,7 @@ def infotodict(seqinfo): regd = parse_dbic_protocol_name(protocol_name_tuned) if image_data_type.startswith('MIP'): - regd['acq'] = regd.get('acq', '') + image_data_type + regd['acq'] = regd.get('acq', '') + sanitize_str(image_data_type) if not regd: skipped_unknown.append(s.series_id) @@ -344,6 +344,7 @@ def infotodict(seqinfo): # suffix += 'seq-%s' % ('+'.join(seq)) # some are ok to skip and not to whine + if "_Scout" in s.series_description or \ (seqtype == 'anat' and seqtype_label == 'scout'): skipped.append(s.series_id) @@ -440,7 +441,7 @@ def infotoids(seqinfos, outdir): def sanitize_str(value): """Remove illegal characters for BIDS from task/acq/etc..""" - return value.translate(None, '#!@$%^&.,:;') + return value.translate(None, '#!@$%^&.,:;_-') def parse_dbic_protocol_name(protocol_name): From 52cfa749440102a2d4c1bcddcd0ee6f7e5439890 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sat, 12 Nov 2016 20:43:35 -0500 Subject: [PATCH 079/181] Impose session naming for one protocol --- heuristics/dbic_bids.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index fbf4c867..7778f3a7 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -381,6 +381,7 @@ def infotoids(seqinfos, outdir): # decide on subjid and session based on patient_id lgr.info("Processing sequence infos to deduce study/session") study_description = get_unique(seqinfos, 'study_description') + study_description_hash = md5sum(study_description) subject = fixup_subjectid(get_unique(seqinfos, 'patient_id')) # TODO: fix up subject id if missing some 0s split = study_description.split('^', 1) @@ -430,6 +431,9 @@ def infotoids(seqinfos, outdir): # Let's be lazy for now just to get somewhere session = '001' + if study_description_hash == '9d148e2a05f782273f6343507733309': + session = 'siemens1' + return { # TODO: request info on study from the JedCap 'locator': locator, From 85b470585fd5f8995d10ce1e8ce1e56d80c6cbbe Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sat, 12 Nov 2016 20:50:28 -0500 Subject: [PATCH 080/181] Be verbose --- heuristics/dbic_bids.py | 1 + 1 file changed, 1 insertion(+) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 7778f3a7..4618b614 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -433,6 +433,7 @@ def infotoids(seqinfos, outdir): if study_description_hash == '9d148e2a05f782273f6343507733309': session = 'siemens1' + lgr.info('Imposing session {0}'.format(session)) return { # TODO: request info on study from the JedCap From be95291ced5dadb88df379a663c8b52c0e494b0e Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC 
Date: Sat, 12 Nov 2016 21:20:42 -0500 Subject: [PATCH 081/181] Fix hash --- bin/heudiconv | 2 +- heuristics/dbic_bids.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 76b09aad..8007a2a8 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1573,4 +1573,4 @@ def main(argv=None): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 4618b614..6d44de8f 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -431,7 +431,7 @@ def infotoids(seqinfos, outdir): # Let's be lazy for now just to get somewhere session = '001' - if study_description_hash == '9d148e2a05f782273f6343507733309': + if study_description_hash == '9d148e2a05f782273f6343507733309d': session = 'siemens1' lgr.info('Imposing session {0}'.format(session)) From 355864509846e764f328fabc7a82227e05fafd0e Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 22 Nov 2016 18:54:37 -0500 Subject: [PATCH 082/181] dbic_bids: if the same template has multiple entries -- canceled runs since we sort now first anyways, there should be no collisions AFAIK and if there are it means that previous runs were either incorrectly numbered or canceled/repeated. All sequences should have had unique names --- heuristics/dbic_bids.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 6d44de8f..836dae17 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -359,6 +359,16 @@ def infotodict(seqinfo): if skipped_unknown: lgr.warning("Could not figure out where to stick %d sequences: %s" % (len(skipped_unknown), skipped_unknown)) + # analyze for "cancelled" runs, if run number was explicitly specified and + # thus we ended up with multiple entries which would mean that older ones + # were "cancelled" + for template in info: + series_ids = info[template] + if len(series_ids) > 1: + lgr.warning("Detected %d canceled run(s) for template %s: %s", + len(series_ids)-1, template[0], series_ids[:-1]) + info[template] = series_ids[-1:] + assert len(info[template]) == 1 return info From 4fe14fb0b3e335cfdb2f3ef445e3c84ec57f732e Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 22 Nov 2016 19:21:28 -0500 Subject: [PATCH 083/181] RF: lower some log messages to DEBUG level (yet to figure out nipype) + RF of some code --- bin/heudiconv | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 8007a2a8..8572f541 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -51,7 +51,6 @@ logging.basicConfig( ) lgr.debug("Starting the abomination") # just to "run-test" logging - global_options = { 'overwrite': False # overwrite existing files } @@ -111,7 +110,7 @@ class TempDirs(object): self.cleanup() def cleanup(self): - lgr.info("Removing %d temporary directories", len(self.dirs)) + lgr.debug("Removing %d temporary directories", len(self.dirs)) for t in self.dirs[:]: lgr.debug("Removing %s", t) if self: @@ -610,25 +609,43 @@ def safe_copyfile(src, dest): def convert(items, symlink=True, converter=None, scaninfo_suffix='.json', custom_callable=None, with_prov=False, is_bids=False, sourcedir=None, outdir=None): + """Performs actual convertion (calls to converter etc) given info from + heuristic's `infotodict` + + Parameters + ---------- + items + symlink + converter + scaninfo_suffix + custom_callable + with_prov + is_bids + sourcedir + outdir + + 
Returns + ------- + None + """ prov_files = [] tmpdir = mkdtemp(prefix='heudiconvtmp') for item_idx, item in enumerate(items): - if isinstance(item[1], (list, tuple)): - outtypes = item[1] - else: - outtypes = [item[1]] - prefix = item[0] + prefix, outtypes, item_dicoms = item[:3] + if not isinstance(outtypes, (list, tuple)): + outtypes = [outtypes] + prefix_dirname = os.path.dirname(prefix + '.ext') prov_file = None outname_bids = prefix + '.json' outname_bids_files = [] # actual bids files since dcm2niix might generate multiple ATM - lgr.info('Converting %s -> %s . Converter: %s', - prefix, prefix_dirname, converter) + lgr.info('Converting %s (%d DICOMs) -> %s . ' + 'Converter: %s . Output types: %s', + prefix, len(item_dicoms), prefix_dirname, converter, outtypes) if not os.path.exists(prefix_dirname): os.makedirs(prefix_dirname) for outtype in outtypes: - item_dicoms = item[2] - lgr.info("Processing %d dicoms for output type %s", + lgr.debug("Processing %d dicoms for output type %s", len(item_dicoms), outtype) lgr.log(1, " those dicoms are: %s", item_dicoms) @@ -777,7 +794,6 @@ def convert(items, symlink=True, converter=None, scaninfo, tmpdir, with_prov) os.chmod(outname, 0o0440) - if custom_callable is not None: custom_callable(*item) shutil.rmtree(tmpdir) @@ -806,7 +822,7 @@ def tuneup_bids_json_files(json_files): # TODO: we might want to reorder them since ATM # the one for shorter TE is the 2nd one! # For now just save truthfully by loading magnitude files - lgr.info("Placing EchoTime fields into phasediff file") + lgr.debug("Placing EchoTime fields into phasediff file") for i in 1, 2: json_['EchoTime%d' % i] = \ json.load(open(json_basename + '_magnitude%d.json' % i))[ From 37045b4caba08c2063fe6eef27f6e8088be29577 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 29 Nov 2016 14:49:01 -0500 Subject: [PATCH 084/181] another accession # to ignore (that was Matteo's change ;) ) --- heuristics/dbic_bids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 836dae17..e0a78c7c 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -81,7 +81,7 @@ def filter_files(fn): # do not check for these accession numbers because they haven't been # recopied with the initial number - donotfilter = ['A000012', 'A000013', 'A000041'] + donotfilter = ['A000012', 'A000013', 'A000020', 'A000041'] split = os.path.split(fn) split2 = os.path.split(split[0]) From 05359d2b11e3104aee3729453454a273c361c643 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 29 Nov 2016 15:19:50 -0500 Subject: [PATCH 085/181] RF+ENH: instead of dropping duplicate sequences, keep them with __dup suffix for post-mortem analysis/renaming it happend that canceled reran run was reran from a different run copy, so it collided and needed to be used by renaming it manually... but that was not possible since we did not store those duplicates before --- .travis.yml | 2 +- heuristics/dbic_bids.py | 30 +++++++++++++++++++++++++----- heuristics/test_dbic_bids.py | 25 +++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 6 deletions(-) create mode 100644 heuristics/test_dbic_bids.py diff --git a/.travis.yml b/.travis.yml index 23d996bd..6f4406b6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,7 +35,7 @@ install: script: # - nosetests -s -v --with-doctest --doctest-tests --with-cov --cover-package . 
--logging-level=INFO tests - - coverage run `which py.test` -s -v tests + - coverage run `which py.test` -s -v tests heuristics after_success: - codecov diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index e0a78c7c..dd6f2d6d 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -1,6 +1,6 @@ import os import re -from collections import defaultdict +from collections import OrderedDict import hashlib import logging @@ -192,7 +192,7 @@ def infotodict(seqinfo): lgr.info("Processing %d seqinfo entries", len(seqinfo)) and_dicom = ('dicom', 'nii.gz') - info = defaultdict(list) + info = OrderedDict() skipped, skipped_unknown = [], [] current_run = 0 run_label = None # run- @@ -351,22 +351,42 @@ def infotodict(seqinfo): lgr.debug("Ignoring %s", s.series_id) else: template = create_key(seqtype, suffix, prefix=prefix) + # we wanted ordered dict for consistent demarcation of dups + if template not in info: + info[template] = [] info[template].append(s.series_id) - info = dict(info) # convert to dict since outside functionality depends on it being a basic dict if skipped: lgr.info("Skipped %d sequences: %s" % (len(skipped), skipped)) if skipped_unknown: lgr.warning("Could not figure out where to stick %d sequences: %s" % (len(skipped_unknown), skipped_unknown)) + + info = get_dups_marked(info) # mark duplicate ones with __dup0x suffix + + info = dict(info) # convert to dict since outside functionality depends on it being a basic dict + return info + + +def get_dups_marked(info): # analyze for "cancelled" runs, if run number was explicitly specified and # thus we ended up with multiple entries which would mean that older ones # were "cancelled" + info = info.copy() + dup_id = 0 for template in info: series_ids = info[template] if len(series_ids) > 1: - lgr.warning("Detected %d canceled run(s) for template %s: %s", - len(series_ids)-1, template[0], series_ids[:-1]) + lgr.warning("Detected %d duplicated run(s) for template %s: %s", + len(series_ids) - 1, template[0], series_ids[:-1]) + # copy the duplicate ones into separate ones + for dup_series_id in series_ids[:-1]: + dup_id += 1 + dup_template = ('%s__dup%02d' % ( + template[0], dup_id),) + template[1:] + # There must have not been such a beast before! 
+ assert dup_template not in info + info[dup_template] = [dup_series_id] info[template] = series_ids[-1:] assert len(info[template]) == 1 return info diff --git a/heuristics/test_dbic_bids.py b/heuristics/test_dbic_bids.py new file mode 100644 index 00000000..47d722f8 --- /dev/null +++ b/heuristics/test_dbic_bids.py @@ -0,0 +1,25 @@ +# +# Tests for dbic_bids.py +# +from collections import OrderedDict +from dbic_bids import get_dups_marked + +def test_get_dups_marked(): + no_dups = {('some',): [1]} + assert get_dups_marked(no_dups) == no_dups + + assert get_dups_marked( + OrderedDict([ + (('bu', 'du'), [1, 2]), + (('smth',), [3]), + (('smth2',), ['a', 'b', 'c']) + ])) == \ + { + ('bu__dup01', 'du'): [1], + ('bu', 'du'): [2], + ('smth',): [3], + ('smth2__dup02',): ['a'], + ('smth2__dup03',): ['b'], + ('smth2',): ['c'] + } + From aa67c291f90f9f80bebab437da472ae2205dcc7e Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 29 Nov 2016 16:53:12 -0500 Subject: [PATCH 086/181] ENH+BF: unlink files before overriding, use __dup- prefix for the suffix --- bin/heudiconv | 13 ++++++++----- heuristics/dbic_bids.py | 4 ++-- heuristics/test_dbic_bids.py | 6 +++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 8572f541..981ecab5 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -556,8 +556,7 @@ def compress_dicoms(dicom_list, out_prefix): import time _old_time = time.time time.time = lambda: dcm_time - if exists(outtar): - # could be under annex and forbid inplace change + if os.path.lexists(outtar): os.unlink(outtar) with tarfile.open(outtar, 'w:gz', dereference=True) as tar: for filename in dicom_list: @@ -600,9 +599,13 @@ def safe_copyfile(src, dest): """ if os.path.isdir(dest): dest = os.path.join(dest, os.path.basename(src)) - if os.path.lexists(dest) and not global_options['overwrite']: - raise ValueError("was asked to copy %s but destination already exists: %s" - % (src, dest)) + if os.path.lexists(dest): + if not global_options['overwrite']: + raise ValueError("was asked to copy %s but destination already exists: %s" + % (src, dest)) + else: + # to make sure we can write there ... still fail if it is entire directory ;) + os.unlink(dest) shutil.copyfile(src, dest) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index dd6f2d6d..a181db53 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -362,7 +362,7 @@ def infotodict(seqinfo): lgr.warning("Could not figure out where to stick %d sequences: %s" % (len(skipped_unknown), skipped_unknown)) - info = get_dups_marked(info) # mark duplicate ones with __dup0x suffix + info = get_dups_marked(info) # mark duplicate ones with __dup-0x suffix info = dict(info) # convert to dict since outside functionality depends on it being a basic dict return info @@ -382,7 +382,7 @@ def get_dups_marked(info): # copy the duplicate ones into separate ones for dup_series_id in series_ids[:-1]: dup_id += 1 - dup_template = ('%s__dup%02d' % ( + dup_template = ('%s__dup-%02d' % ( template[0], dup_id),) + template[1:] # There must have not been such a beast before! 
assert dup_template not in info diff --git a/heuristics/test_dbic_bids.py b/heuristics/test_dbic_bids.py index 47d722f8..4d106914 100644 --- a/heuristics/test_dbic_bids.py +++ b/heuristics/test_dbic_bids.py @@ -15,11 +15,11 @@ def test_get_dups_marked(): (('smth2',), ['a', 'b', 'c']) ])) == \ { - ('bu__dup01', 'du'): [1], + ('bu__dup-01', 'du'): [1], ('bu', 'du'): [2], ('smth',): [3], - ('smth2__dup02',): ['a'], - ('smth2__dup03',): ['b'], + ('smth2__dup-02',): ['a'], + ('smth2__dup-03',): ['b'], ('smth2',): ['c'] } From af20de73f2a89a8c52142e11cc1d286a0748c90a Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 10 Jan 2017 11:18:53 -0500 Subject: [PATCH 087/181] BF: one little heuristic to generally fix anat_T1w -> anat-T1w --- heuristics/dbic_bids.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index a181db53..d4b21744 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -480,9 +480,13 @@ def sanitize_str(value): def parse_dbic_protocol_name(protocol_name): - """Parse protocol name + """Parse protocol name according to our convention with minimal set of fixups """ + # Since Yarik didn't know better place to put it in, but could migrate outside + # at some point + protocol_name = protocol_name.replace("anat_T1w", "anat-T1w") + # Parse the name according to our convention # https://docs.google.com/document/d/1R54cgOe481oygYVZxI7NHrifDyFUZAjOBwCTu7M7y48/edit?usp=sharing # Remove possible suffix we don't care about after __ @@ -708,3 +712,11 @@ def test_parse_dbic_protocol_name(): 'seqtype_label': 'scout', 'session': '+', } + + assert pdpn("anat_T1w_acq-MPRAGE_run+") == \ + { + 'seqtype': 'anat', + 'run': '+', + 'acq': 'MPRAGE', + 'seqtype_label': 'T1w' + } From 0ed5a7172a7dfe7bedbe20103a09b7e729a77885 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 10 Jan 2017 16:55:37 -0500 Subject: [PATCH 088/181] ENH: some canceled runs for ontrack study + minor re-formatting --- heuristics/dbic_bids.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index d4b21744..8767c8ef 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -18,6 +18,9 @@ 'A000082': ['^5-'], 'A000088': ['^9-'], 'A000090': ['^5-'], + 'A000127': ['^21-'], + 'A000130': ['^15-'], + 'A000137': ['^9-', '^11-'], } # dictionary containing fixes, keys are md5sum of study_description from @@ -382,8 +385,9 @@ def get_dups_marked(info): # copy the duplicate ones into separate ones for dup_series_id in series_ids[:-1]: dup_id += 1 - dup_template = ('%s__dup-%02d' % ( - template[0], dup_id),) + template[1:] + dup_template = ( + '%s__dup-%02d' % (template[0], dup_id), + ) + template[1:] # There must have not been such a beast before! 
assert dup_template not in info info[dup_template] = [dup_series_id] From 823661f6e30b89c46bef38a92d067a11518934fc Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 10 Jan 2017 16:58:21 -0500 Subject: [PATCH 089/181] BF(TST): we are sanitizing values, so - gets removed as well atm --- heuristics/dbic_bids.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 8767c8ef..01ddab81 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -661,9 +661,9 @@ def test_fix_dbic_protocol(): def test_sanitize_str(): - assert sanitize_str('acq-super@duper.faster') == 'acq-superduperfaster' - assert sanitize_str('acq-perfect') == 'acq-perfect' - assert sanitize_str('acq-never:use:colon:!') == 'acq-neverusecolon' + assert sanitize_str('super@duper.faster') == 'superduperfaster' + assert sanitize_str('perfect') == 'perfect' + assert sanitize_str('never:use:colon:!') == 'neverusecolon' def test_fixupsubjectid(): From 320df078959cf4d0480a9456ebe86d3075c4372c Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 10 Jan 2017 17:58:12 -0500 Subject: [PATCH 090/181] RF: make marked canceled runs handling independent of "fixes" --- heuristics/dbic_bids.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 01ddab81..5d3d288c 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -133,6 +133,8 @@ def fix_canceled_runs(seqinfo, accession2run=fix_accession2run): """Function that adds cancelme_ to known bad runs which were forgotten""" accession_number = get_unique(seqinfo, 'accession_number') if accession_number in accession2run: + lgr.info("Considering some runs possibly marked to be " + "canceled for accession %s", accession_number) badruns = accession2run[accession_number] badruns_pattern = '|'.join(badruns) for i, s in enumerate(seqinfo): @@ -147,9 +149,6 @@ def fix_canceled_runs(seqinfo, accession2run=fix_accession2run): def fix_dbic_protocol(seqinfo, keys=keys2replace, subsdict=protocols2fix): """Ad-hoc fixup for existing protocols""" - # add cancelme to known bad runs - seqinfo = fix_canceled_runs(seqinfo) - # get name of the study to check if we know how to fix it up study_descr = get_unique(seqinfo, 'study_description') study_descr_hash = md5sum(study_descr) @@ -188,6 +187,10 @@ def infotodict(seqinfo): """ # XXX: ad hoc hack study_description = get_unique(seqinfo, 'study_description') + + # add cancelme to known bad runs + seqinfo = fix_canceled_runs(seqinfo) + if md5sum(study_description) in protocols2fix: lgr.info("Fixing up protocol for {0}".format(study_description)) seqinfo = fix_dbic_protocol(seqinfo) From c354bdb75ef00e85bc33259fee48f58a3584f6fb Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 31 Jan 2017 12:37:34 -0500 Subject: [PATCH 091/181] initial version of the singularity env definition file --- custom/dbic/README | 3 +++ custom/dbic/singularity-env.def | 44 +++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 custom/dbic/README create mode 100644 custom/dbic/singularity-env.def diff --git a/custom/dbic/README b/custom/dbic/README new file mode 100644 index 00000000..844e7446 --- /dev/null +++ b/custom/dbic/README @@ -0,0 +1,3 @@ +Scripts and configurations used alongside with heudiconv setup at DBIC +(Dartmouth Brain Imaging Center). 
Might migrate to an independent repository +eventually but for now placed here with hope they might come useful for some. diff --git a/custom/dbic/singularity-env.def b/custom/dbic/singularity-env.def new file mode 100644 index 00000000..fdb98fd3 --- /dev/null +++ b/custom/dbic/singularity-env.def @@ -0,0 +1,44 @@ +# Copyright (c) 2015-2016, Gregory M. Kurtzer. All rights reserved. +# +# Changes for NeuroDebian/DBIC setup are Copyright (c) 2017 Yaroslav Halchenko. +# +# The purpose of the environment is to provide a complete suite for running +# heudiconv on the INBOX server to provide conversion into BIDS layout. +# ATM it does not ship heudiconv itself which would be accessed directly +# from the main drive for now. +# +# "Singularity" Copyright (c) 2016, The Regents of the University of California, +# through Lawrence Berkeley National Laboratory (subject to receipt of any +# required approvals from the U.S. Dept. of Energy). All rights reserved. + +# +# Notes: +# - Due to https://github.com/singularityware/singularity/issues/471 +# bootstrapping leads to non-usable/non-removable-without-reboot +# image due to some rogue run away processes. +# This line could help to kill them but should be used with caution +# since could kill other unrelated processes +# +# grep -l loop /proc/*/mountinfo | sed -e 's,/proc/\(.*\)/.*,\1,g' | while read pid; do sudo kill $pid; done + +BootStrap: debootstrap +OSVersion: stable +MirrorURL: http://ftp.us.debian.org/debian/ + +# so if image is executed we just enter the environment +%runscript + echo "Welcome to the DBIC BIDS environment" + /bin/bash + + +%post + echo "Configuring the environment" + apt-get update + apt-get -y install vim eatmydata wget strace time + wget -q -O/tmp/nd-configurerepo https://raw.githubusercontent.com/neurodebian/neurodebian/4d26c8f30433145009aa3f74516da12f560a5a13/tools/nd-configurerepo + bash /tmp/nd-configurerepo + eatmydata apt-get -y install datalad python-nipype virtualenv dcm2niix + apt-get clean + # and wipe out apt lists since not to be used RW for further tuning + rm /var/lib/apt/lists/* + From 48be77117aa3e551c12d68df0af547002b189e42 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 7 Feb 2017 09:13:30 -0500 Subject: [PATCH 092/181] tune up singularity def to have /afs /inbox mount points and to not wipe out apt listings --- custom/dbic/singularity-env.def | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/custom/dbic/singularity-env.def b/custom/dbic/singularity-env.def index fdb98fd3..8fbcd8c2 100644 --- a/custom/dbic/singularity-env.def +++ b/custom/dbic/singularity-env.def @@ -34,11 +34,18 @@ MirrorURL: http://ftp.us.debian.org/debian/ %post echo "Configuring the environment" apt-get update - apt-get -y install vim eatmydata wget strace time + apt-get -y install vim eatmydata wget strace time ncdu wget -q -O/tmp/nd-configurerepo https://raw.githubusercontent.com/neurodebian/neurodebian/4d26c8f30433145009aa3f74516da12f560a5a13/tools/nd-configurerepo bash /tmp/nd-configurerepo + chmod a+r -R /etc/apt eatmydata apt-get -y install datalad python-nipype virtualenv dcm2niix apt-get clean + # and wipe out apt lists since not to be used RW for further tuning - rm /var/lib/apt/lists/* + # find /var/lib/apt/lists/ -type f -delete + # /usr/bin/find /var/lib/apt/lists/ -type f -name \*Packages\* -o -name \*Contents\* + # complicates later interrogation - thus disabled + + # Create some bind mount directories present on rolando + mkdir -p /afs /inbox From 
ff33235bf57000c6bd09ab15a69810aa290c2521 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 7 Feb 2017 10:40:28 -0500 Subject: [PATCH 093/181] this is aint a dataset --- .datalad/config | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 .datalad/config diff --git a/.datalad/config b/.datalad/config deleted file mode 100644 index 423a607f..00000000 --- a/.datalad/config +++ /dev/null @@ -1,2 +0,0 @@ -[datalad "dataset"] - id = b90e9412-9672-11e6-814e-8019340ce7f2 From a47fb0ad7ff2cc8405ad6a6c31f2d6c4861d1919 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 7 Feb 2017 22:00:28 -0500 Subject: [PATCH 094/181] ENH+RF: post-treat info file to remove abundant info, make json dump human-friendly --- bin/heudiconv | 121 ++++++++++++++++++++++++++++++++++++++++++--- tests/test_main.py | 10 +++- 2 files changed, 123 insertions(+), 8 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 981ecab5..74a0d3f0 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -31,6 +31,7 @@ import sys from tempfile import mkdtemp import tarfile +from copy import deepcopy from collections import namedtuple from collections import defaultdict from collections import OrderedDict as ordereddict @@ -146,6 +147,49 @@ def save_json(filename, data): fp.write(json_str.replace(' \n', '\n')) +def slim_down_info(j): + """Given an aggregated info structure, removes excessive details + + such as Csa fields, and SourceImageSequence which on Siemens files could be + huge and not providing any additional immediately usable information. + If needed, could be recovered from stored DICOMs + """ + j = deepcopy(j) # we will do in-place modification on a copy + dicts = [] + # poor man programming for now + if 'const' in j.get('global', {}): + dicts.append(j['global']['const']) + if 'samples' in j.get('time', {}): + dicts.append(j['time']['samples']) + for d in dicts: + for k in list(d.keys()): + if k.startswith('Csa') or k.lower() in {'sourceimagesequence'}: + del d[k] + return j + + +def json_dumps_pretty(j, indent=2, sort_keys=True): + """Given a json structure, pretty print it by colliding numeric arrays into a line + + If resultant structure differs from original -- throws exception + """ + import re + js = json.dumps(j, indent=indent, sort_keys=sort_keys) + # trim away \n and spaces between entries of numbers + js_ = re.sub( + '[\n ]+("?[-+.0-9e]+"?,?) *\n(?= *"?[-+.0-9e]+"?)', r' \1', + js, flags=re.MULTILINE) + # uniform no spaes before ] + js_ = re.sub(" *\]", "]", js_) + # uniform spacing before numbers + js_ = re.sub(' *("?[-+.0-9e]+"?)[ \n]*', r' \1', js_) + # no spaces after [ + js_ = re.sub('\[ ', '[', js_) + j_ = json.loads(js_) + assert(j == j_) + return js_ + + def load_json(filename): """Load data from a json file @@ -485,6 +529,29 @@ def conversion_info(subject, outdir, info, filegroup, ses=None): def embed_nifti(dcmfiles, niftifile, infofile, bids_info=None, force=False): + """ + + If `niftifile` doesn't exist, it gets created out of the `dcmfiles` stack, + and json representation of its meta_ext is returned (bug since should return + both niftifile and infofile?) 
+ + if `niftifile` exists, its affine's orientation information is used while + establishing new `NiftiImage` out of dicom stack and together with `bids_info` + (if provided) is dumped into json `infofile` + + Parameters + ---------- + dcmfiles + niftifile + infofile + bids_info + force + + Returns + ------- + niftifile, infofile + + """ import dcmstack as ds import nibabel as nb import os @@ -792,9 +859,12 @@ def convert(items, symlink=True, converter=None, "For now not embedding BIDS and info generated .nii.gz itself since sequence produced multiple files") else: #if not is_bids or converter != 'dcm2niix': ##uses dcm2niix's infofile - embed_metadata_into_nifti(converter, is_bids, item_dicoms, - outname, outname_bids, prov_file, - scaninfo, tmpdir, with_prov) + embed_metadata_from_dicoms(converter, is_bids, item_dicoms, + outname, outname_bids, prov_file, + scaninfo, tmpdir, with_prov) + if exists(scaninfo): + lgr.info("Post-treating %s file", scaninfo) + treat_infofile(scaninfo) os.chmod(outname, 0o0440) if custom_callable is not None: @@ -841,9 +911,28 @@ def tuneup_bids_json_files(json_files): # place them into phasediff -def embed_metadata_into_nifti(converter, is_bids, item_dicoms, outname, - outname_bids, prov_file, scaninfo, tmpdir, - with_prov): +def embed_metadata_from_dicoms(converter, is_bids, item_dicoms, outname, + outname_bids, prov_file, scaninfo, tmpdir, + with_prov): + """ + Enhance sidecar information file with more information from DICOMs + + Parameters + ---------- + converter + is_bids + item_dicoms + outname + outname_bids + prov_file + scaninfo + tmpdir + with_prov + + Returns + ------- + + """ from nipype import Node, Function embedfunc = Node(Function(input_names=['dcmfiles', 'niftifile', @@ -873,10 +962,28 @@ def embed_metadata_into_nifti(converter, is_bids, item_dicoms, outname, format='turtle') g.serialize(prov_file, format='turtle') os.chmod(prov_file, 0o0440) - except: + except Exception as exc: + lgr.error("Embedding failed: %s", str(exc)) os.chdir(cwd) +def treat_infofile(filename): + """Tune up generated .json file (slim down, pretty-print for humans). 
+ + Was difficult to do within embed_nifti since it has no access to our functions + """ + with open(filename) as f: + j = json.load(f) + + j_slim = j # slim_down_info(j) + j_pretty = json_dumps_pretty(j_slim, indent=2, sort_keys=True) + + os.chmod(filename, 0o0660) + with open(filename, 'wt') as fp: + fp.write(j_pretty) + os.chmod(filename, 0o0440) + + def convert_dicoms(sid, dicoms, outdir, diff --git a/tests/test_main.py b/tests/test_main.py index 69a264cc..683909b1 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -89,4 +89,12 @@ def test_prepare_for_datalad(tmpdir): assert set(ds.repo.get_indexed_files()) == target_files # and all are under git for f in target_files: - assert not ds.repo.is_under_annex(f) \ No newline at end of file + assert not ds.repo.is_under_annex(f) + +def test_json_dumps_pretty(): + pretty = heudiconv.json_dumps_pretty + assert pretty({}) == "{}" + assert pretty({"a": -1, "b": "123", "c": [1, 2, 3], "d": ["1.0", "2.0"]}) \ + == '{\n "a": -1, \n "b": "123", \n "c": [1, 2, 3], \n "d": ["1.0", "2.0"]\n}' + assert pretty({'a': ["0.3", "-1.9128906358217845e-12", "0.2"]}) \ + == '{\n "a": ["0.3", "-1.9128906358217845e-12", "0.2"]\n}' \ No newline at end of file From 1b8e71b162e1c6e132adc5f641d81f5c93c4ba05 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 7 Feb 2017 22:01:32 -0500 Subject: [PATCH 095/181] ENH: could work just with --command option for quick application (+enable json slimming, default to dcm2niix) --- bin/heudiconv | 23 ++++++++++++++++++----- heuristics/dbic_bids.py | 4 ++++ 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 74a0d3f0..c1186473 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -679,7 +679,7 @@ def safe_copyfile(src, dest): def convert(items, symlink=True, converter=None, scaninfo_suffix='.json', custom_callable=None, with_prov=False, is_bids=False, sourcedir=None, outdir=None): - """Performs actual convertion (calls to converter etc) given info from + """Perform actual conversion (calls to converter etc) given info from heuristic's `infotodict` Parameters @@ -699,7 +699,7 @@ def convert(items, symlink=True, converter=None, None """ prov_files = [] - tmpdir = mkdtemp(prefix='heudiconvtmp') + tmpdir = mkdtemp(prefix='heudiconvdcm') for item_idx, item in enumerate(items): prefix, outtypes, item_dicoms = item[:3] if not isinstance(outtypes, (list, tuple)): @@ -975,7 +975,7 @@ def treat_infofile(filename): with open(filename) as f: j = json.load(f) - j_slim = j # slim_down_info(j) + j_slim = slim_down_info(j) j_pretty = json_dumps_pretty(j_slim, indent=2, sort_keys=True) os.chmod(filename, 0o0660) @@ -1126,7 +1126,7 @@ def get_extracted_dicoms(fl): # are unique, or at least in a unqiue subdir per session # strategy: extract everything in a temp dir and assemble a list # of all files in all tarballs - tmpdir = tempdirs(prefix='heudiconvtmp') + tmpdir = tempdirs(prefix='heudiconvdcm') sessions = defaultdict(list) session = 0 @@ -1474,6 +1474,15 @@ def _main(args): subjs = args.subjs outdir = os.path.abspath(args.outdir) + if args.command: + # custom mode of operation + if args.command == 'treat_json': + for f in files_opt: + treat_infofile(f) + else: + raise ValueError("Unknown command %s", args.command) + return + # TODO: Move into a function! 
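# Rough standalone illustration of the slimming that treat_infofile (via
# slim_down_info, defined above) performs on a sidecar before pretty-printing.
# The dict below is a made-up minimal example, not real scanner output:
info = {'global': {'const': {'CsaImage.MosaicRefAcqTimes': [0.0, 2.5],
                             'RepetitionTime': 2.0}},
        'time': {'samples': {'SourceImageSequence': ['1.2.840.10008...'],
                             'AcquisitionTime': ['120000.00']}}}
for d in (info['global']['const'], info['time']['samples']):
    for k in list(d):
        if k.startswith('Csa') or k.lower() == 'sourceimagesequence':
            del d[k]
print(info)  # only RepetitionTime and AcquisitionTime survive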
study_sessions = get_study_sessions( dicom_dir_template, files_opt, @@ -1617,7 +1626,7 @@ def get_parser(): 'first be "sorted" and subject IDs deduced by the ' 'heuristic') parser.add_argument('-c', '--converter', dest='converter', - required=True, + default='dcm2niix', choices=( 'mri_convert', 'dcmstack', 'dcm2nii', 'dcm2niix', 'none'), @@ -1663,6 +1672,10 @@ def get_parser(): (i.e. no symlinks to under .git/annex). For now just for BIDS mode.''') parser.add_argument('--dbg', action='store_true', dest='debug', help="do not catch exceptions and show exception traceback") + parser.add_argument('--command', dest='command', + choices=('treat_json',), + help='''custom actions to be performed on provided files instead of + regular operation.''') parser.add_argument( 'files', nargs='*', diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 5d3d288c..9afbc160 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -105,6 +105,10 @@ def filter_files(fn): elif accession_number.startswith('phantom-'): # Accessions on phantoms, e.g. in dartmouth-phantoms/bids_test4-20161014 return True + elif accession_number.startswith('heudiconvdcm'): + # we were given some tarball with dicoms which was extracted so we + # better obey + return True else: return True if re.match('^[0-9]+-', sequence_dir) else False From 8e82315a5944be60c807967642d6972f5997bc0f Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 7 Feb 2017 22:13:42 -0500 Subject: [PATCH 096/181] BF: explicitly repr study_session_info for exception msg --- bin/heudiconv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/heudiconv b/bin/heudiconv index 981ecab5..4c8603c3 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1157,7 +1157,7 @@ def get_study_sessions(dicom_dir_template, files_opt, heuristic, outdir, if study_session_info in study_sessions: raise ValueError( "We already have a study session with the same value %s" - % study_session_info) + % repr(study_session_info)) study_sessions[study_session_info] = seqinfo return study_sessions From 4e8a265586707fd3d7044a63855f4450eec7a5f9 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 7 Feb 2017 22:17:24 -0500 Subject: [PATCH 097/181] ENH: load heuristic only if not processing custom command --- bin/heudiconv | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index a2614a39..841fb15a 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1453,10 +1453,6 @@ def setup_exceptionhook(): def _main(args): """Given a structure of arguments from the parser perform computation""" - # - # Load heuristic -- better do it asap to make sure it loads correctly - # - heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) # # Deal with provided files or templates @@ -1483,6 +1479,11 @@ def _main(args): raise ValueError("Unknown command %s", args.command) return + # + # Load heuristic -- better do it asap to make sure it loads correctly + # + heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) + # TODO: Move into a function! study_sessions = get_study_sessions( dicom_dir_template, files_opt, From c420d2149f0352579f997a7fdeaab87d0d43ea3f Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 9 Feb 2017 22:27:58 -0500 Subject: [PATCH 098/181] permissive permissions on created /afs and /inbox (is my umask hauting singularity again?) 
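For reference, the umask suspicion above is easy to reproduce outside of Singularity:
a directory created while a restrictive umask (e.g. 077) is in effect stays unreadable
to other users until an explicit chmod, which is what the added chmod a+rX /afs /inbox
compensates for. A minimal Python sketch (illustration only, not part of the image
definition):

    import os, stat, tempfile

    os.umask(0o077)                               # restrictive umask, as suspected above
    d = os.path.join(tempfile.mkdtemp(), 'inbox')
    os.makedirs(d)
    print(oct(stat.S_IMODE(os.stat(d).st_mode)))  # 0o700 -- others cannot read or enter
    os.chmod(d, 0o755)                            # equivalent of chmod a+rX
    print(oct(stat.S_IMODE(os.stat(d).st_mode)))  # 0o755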
--- custom/dbic/singularity-env.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom/dbic/singularity-env.def b/custom/dbic/singularity-env.def index 8fbcd8c2..b0413290 100644 --- a/custom/dbic/singularity-env.def +++ b/custom/dbic/singularity-env.def @@ -48,4 +48,4 @@ MirrorURL: http://ftp.us.debian.org/debian/ # Create some bind mount directories present on rolando mkdir -p /afs /inbox - + chmod a+rX /afs /inbox From 982470c5c12ec2c7ee0ba50e8511d6f213d9508c Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 13 Feb 2017 13:28:19 -0500 Subject: [PATCH 099/181] skeleton record for future task file creation --- bin/heudiconv | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/bin/heudiconv b/bin/heudiconv index 841fb15a..276334d4 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -661,6 +661,20 @@ def get_dicom_series_time(dicom_list): return calendar.timegm(time.strptime(dicom_time_str, '%Y%m%d%H%M%S')) +def prep_task_file(taskname): + TODO = "TODO" + { + "RepetitionTime": 1.56, + "TaskName": taskname, + "EchoTime": TODO, + "FlipAngle": TODO, + "SliceTiming": TODO, + "Manufacturer": "Siemens", + "ManufacturerModelName": "TODO", + "MagneticFieldStrength": 3.0, + "CogAtlasID": "TODO" + } + def safe_copyfile(src, dest): """Copy file but blow if destination name already exists """ From b4592ee29da6d5733526cf22f2ffce255141418e Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 13 Feb 2017 13:35:58 -0500 Subject: [PATCH 100/181] respect session from cmdline if was provided and none was specified in study --- bin/heudiconv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/heudiconv b/bin/heudiconv index 276334d4..451d02d9 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1273,7 +1273,7 @@ def get_study_sessions(dicom_dir_template, files_opt, heuristic, outdir, lgr.info("Study session for %s" % str(ids)) study_session_info = StudySessionInfo( ids.get('locator'), - ids.get('session', session), + ids.get('session', session) or session, ids.get('subject', None)) if study_session_info in study_sessions: raise ValueError( From 28e4d9ddb5c003677723e93719a2275fb02e7d66 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Wed, 15 Feb 2017 12:35:33 -0500 Subject: [PATCH 101/181] ls command to list information about what studies are available under each provided path --- bin/heudiconv | 11 ++++++++++- tests/test_heuristics.py | 15 ++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 451d02d9..fda55751 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1489,6 +1489,15 @@ def _main(args): if args.command == 'treat_json': for f in files_opt: treat_infofile(f) + elif args.command == 'ls': + heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) + for f in files_opt: + study_sessions = get_study_sessions( + dicom_dir_template, [f], + heuristic, None, session, subjs) + print(f) + for study_session, sequences in study_sessions.items(): + print("\t%s %d sequences" % (str(study_session), len(sequences))) else: raise ValueError("Unknown command %s", args.command) return @@ -1688,7 +1697,7 @@ def get_parser(): parser.add_argument('--dbg', action='store_true', dest='debug', help="do not catch exceptions and show exception traceback") parser.add_argument('--command', dest='command', - choices=('treat_json',), + choices=('treat_json', 'ls'), help='''custom actions to be performed on provided files instead of regular operation.''') parser.add_argument( diff --git 
a/tests/test_heuristics.py b/tests/test_heuristics.py index b60ab56d..67cb4c7c 100644 --- a/tests/test_heuristics.py +++ b/tests/test_heuristics.py @@ -1,5 +1,9 @@ from . import heudiconv +import os +from mock import patch +from six.moves import StringIO + import pytest from datalad.api import Dataset @@ -34,4 +38,13 @@ def test_dbic_bids_largely_smoke(tmpdir): # and at the same commit assert ds.is_installed() assert not ds.repo.dirty - assert head == ds.repo.get_hexsha() \ No newline at end of file + assert head == ds.repo.get_hexsha() + + +@patch('sys.stdout', new_callable=StringIO) +def test_ls(stdout): + args = "-f heuristics/dbic_bids.py --command ls tests/data".split(' ') + heudiconv.main(args) + out = stdout.getvalue() + assert 'StudySessionInfo(locator=' in out + assert 'Halchenko/Yarik/950_bids_test4' in out \ No newline at end of file From 978b2ccbda927c9b99acbcee8ce77bb47cd4ca99 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 16 Feb 2017 18:33:37 -0500 Subject: [PATCH 102/181] minor: comment on a problem --- bin/heudiconv | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/bin/heudiconv b/bin/heudiconv index fda55751..caa8fc16 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -179,7 +179,7 @@ def json_dumps_pretty(j, indent=2, sort_keys=True): js_ = re.sub( '[\n ]+("?[-+.0-9e]+"?,?) *\n(?= *"?[-+.0-9e]+"?)', r' \1', js, flags=re.MULTILINE) - # uniform no spaes before ] + # uniform no spaces before ] js_ = re.sub(" *\]", "]", js_) # uniform spacing before numbers js_ = re.sub(' *("?[-+.0-9e]+"?)[ \n]*', r' \1', js_) @@ -968,6 +968,23 @@ def embed_metadata_from_dicoms(converter, is_bids, item_dicoms, outname, embedfunc.base_dir = tmpdir cwd = os.getcwd() try: + """ + Ran into +INFO: Executing node embedder in dir: /tmp/heudiconvdcm2W3UQ7/embedder +ERROR: Embedding failed: [Errno 13] Permission denied: '/inbox/BIDS/tmp/test2-jessie/Wheatley/Beau/1007_personality/sub-sid000138/fmap/sub-sid000138_3mm_run-01_phasediff.json' +while +HEUDICONV_LOGLEVEL=WARNING time bin/heudiconv -f heuristics/dbic_bids.py -c dcm2niix -o /inbox/BIDS/tmp/test2-jessie --bids --datalad /inbox/DICOM/2017/01/28/A000203 + +so it seems that there is a filename collision so it tries to save into the same file name +and there was a screw up for that A + +/mnt/btrfs/dbic/inbox/DICOM/2017/01/28/A000203 + StudySessionInfo(locator='Wheatley/Beau/1007_personality', session=None, subject='sid000138') 16 sequences + StudySessionInfo(locator='Wheatley/Beau/1007_personality', session=None, subject='a000203') 2 sequences + + +in that one though + """ res = embedfunc.run() os.chmod(scaninfo, 0o0440) if with_prov: From 0809c8b07450e13ad9b9ac98641be005f9b96171 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Fri, 17 Feb 2017 16:46:34 -0500 Subject: [PATCH 103/181] ask for some DICOM fields allowing them not to be specified, allow study description to be empty (goes into unknown) was all made to robustify for "run against everything" --- bin/heudiconv | 18 ++++++++++-------- heuristics/dbic_bids.py | 15 ++++++++++----- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index fda55751..7ce7efbb 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -433,15 +433,15 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=F 'MoCo' in dcminfo.SeriesDescription, # New ones by us 'derived' in [x.lower() for x in dcminfo.get('ImageType', [])], - dcminfo.PatientID, - dcminfo.StudyDescription, - 
dcminfo.ReferringPhysicianName, - dcminfo.SeriesDescription, + dcminfo.get('PatientID'), + dcminfo.get('StudyDescription'), + dcminfo.get('ReferringPhysicianName'), + dcminfo.get('SeriesDescription'), tuple(dcminfo.ImageType), - dcminfo.AccessionNumber, + dcminfo.get('AccessionNumber'), # For demographics to populate BIDS participants.tsv - dcminfo.PatientsAge, - dcminfo.PatientsSex, + dcminfo.get('PatientsAge'), + dcminfo.get('PatientsSex'), ) # candidates # dcminfo.AccessionNumber @@ -1276,9 +1276,11 @@ def get_study_sessions(dicom_dir_template, files_opt, heuristic, outdir, ids.get('session', session) or session, ids.get('subject', None)) if study_session_info in study_sessions: - raise ValueError( + #raise ValueError( + lgr.warning( "We already have a study session with the same value %s" % repr(study_session_info)) + continue # skip for now study_sessions[study_session_info] = seqinfo return study_sessions diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 9afbc160..56f4c01c 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -129,6 +129,8 @@ def create_key(subdir, file_suffix, outtype=('nii.gz', 'dicom'), def md5sum(string): """Computes md5sum of as string""" + if not string: + return "" # not None so None was not compared to strings m = hashlib.md5(string.encode()) return m.hexdigest() @@ -425,12 +427,15 @@ def infotoids(seqinfos, outdir): study_description_hash = md5sum(study_description) subject = fixup_subjectid(get_unique(seqinfos, 'patient_id')) # TODO: fix up subject id if missing some 0s - split = study_description.split('^', 1) - # split first one even more, since couldbe PI_Student or PI-Student - split = re.split('-|_', split[0], 1) + split[1:] + if study_description: + split = study_description.split('^', 1) + # split first one even more, since couldbe PI_Student or PI-Student + split = re.split('-|_', split[0], 1) + split[1:] - # locator = study_description.replace('^', '/') - locator = '/'.join(split) + # locator = study_description.replace('^', '/') + locator = '/'.join(split) + else: + locator = 'unknown' # TODO: actually check if given study is study we would care about # and if not -- we should throw some ???? 
exception From f86568aac6925b4bff093f954ce8e748633496c2 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 17 Feb 2017 16:52:17 -0500 Subject: [PATCH 104/181] (re)Populate task and events files --- bin/heudiconv | 75 +++++++++++++++++++++++++++++------------ heuristics/dbic_bids.py | 11 ++++++ tests/test_main.py | 5 ++- 3 files changed, 68 insertions(+), 23 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index caa8fc16..4ac6b8e4 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -661,20 +661,6 @@ def get_dicom_series_time(dicom_list): return calendar.timegm(time.strptime(dicom_time_str, '%Y%m%d%H%M%S')) -def prep_task_file(taskname): - TODO = "TODO" - { - "RepetitionTime": 1.56, - "TaskName": taskname, - "EchoTime": TODO, - "FlipAngle": TODO, - "SliceTiming": TODO, - "Manufacturer": "Siemens", - "ManufacturerModelName": "TODO", - "MagneticFieldStrength": 3.0, - "CogAtlasID": "TODO" - } - def safe_copyfile(src, dest): """Copy file but blow if destination name already exists """ @@ -1129,7 +1115,10 @@ def convert_dicoms(sid, seqinfo.keys()[0].patient_age, seqinfo.keys()[0].patient_sex, ) - populate_bids_templates(anon_outdir) + populate_bids_templates( + anon_outdir, + getattr(heuristic, 'DEFAULT_FIELDS', {}) + ) def get_annonimized_sid(sid, anon_sid_cmd): @@ -1320,17 +1309,18 @@ def create_file_if_missing(filename, content): return True -def populate_bids_templates(path): +def populate_bids_templates(path, defaults={}): # dataset descriptor + lgr.info("Populating template files under %s", path) descriptor = opj(path, 'dataset_description.json') - if not exists(descriptor): + if True: # not exists(descriptor): save_json(descriptor, ordereddict([ ('Name', "TODO: name of the dataset"), ('BIDSVersion', "1.0.1"), - ('License', "TODO: choose a license, e.g. PDDL (http://opendatacommons.org/licenses/pddl/)"), - ('Authors', ["TODO:", "First1 Last1", "First2 Last2", "..."]), - ('Acknowledgements', "We thank Terry Sacket and the rest of the DBIC (Dartmouth Brain Imaging Center) personnel for assistance in data collection. TODO: more"), + ('License', defaults.get('License', "TODO: choose a license, e.g. PDDL (http://opendatacommons.org/licenses/pddl/)")), + ('Authors', defaults.get('Authors', ["TODO:", "First1 Last1", "First2 Last2", "..."])), + ('Acknowledgements', defaults.get('Acknowledgements', 'TODO: whom you want to acknowledge')), ('HowToAcknowledge', "TODO: describe how to acknowledge -- either cite a corresponding paper, or just in acknowledgement section"), ('Funding', ["TODO", "GRANT #1", "GRANT #2"]), ('ReferencesAndLinks', ["TODO", "List of papers or websites"]), @@ -1368,6 +1358,40 @@ TODO: Provide description for the dataset -- basic details about the study, possibly pointing to pre-registration (if public or embargoed) """) + # TODO: collect all task- .json files for func files to + tasks = {} + # way too many -- let's just collect all which are the same! 
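# Spelled-out version of the rule implemented just below, as a standalone
# sketch on plain dicts (hypothetical helper, not used by heudiconv itself):
# keep only those sidecar fields whose values agree across every run of a task.
def _common_fields(sidecars):
    common = dict(sidecars[0])
    for sc in sidecars[1:]:
        for field in list(common):
            if field not in sc or sc[field] != common[field]:
                del common[field]
    return common

# e.g. _common_fields([{'RepetitionTime': 2.0, 'EchoTime': 0.03},
#                      {'RepetitionTime': 2.0, 'EchoTime': 0.025}])
#      == {'RepetitionTime': 2.0}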
+ # FIELDS_TO_TRACK = {'RepetitionTime', 'FlipAngle', 'EchoTime', 'Manufacturer', 'SliceTiming', ''} + for fpath in find_files('.*_task-.*\_bold\.json', topdir=path, + exclude_vcs=True, exclude="/\.(datalad|heudiconv)/"): + task = re.sub('.*_(task-[^_\.]*(_acq-[^_\.]*)?)_.*', r'\1', fpath) + j = load_json(fpath) + if task not in tasks: + tasks[task] = j + else: + rec = tasks[task] + # let's retain only those fields which have the same value + for field in sorted(rec): + if field not in j or j[field] != rec[field]: + del rec[field] + + # create a stub onsets file for each one of those + suf = '_bold.json' + assert fpath.endswith(suf) + events_file = fpath[:-len(suf)] + '_events.tsv' + lgr.debug("Generating %s", events_file) + with open(events_file, 'w') as f: + f.write("onset\tduration\ttrial_type\tTODO -- fill in rows and add more tab-separated columns if desired") + + # - extract tasks files stubs + for task_acq, fields in tasks.items(): + task_file = opj(path, task_acq + '_bold.json') + lgr.debug("Generating %s", task_file) + fields["TaskName"] = "TODO: full task name for %s" % task_acq.split('_')[0].split('-')[1] + fields["CogAtlasID"] = "TODO" + with open(task_file, 'w') as f: + f.write(json_dumps_pretty(fields, indent=2, sort_keys=True)) + def add_participant_record(studydir, subject, age, sex): participants_tsv = opj(studydir, 'participants.tsv') @@ -1503,7 +1527,7 @@ def _main(args): if args.command: # custom mode of operation - if args.command == 'treat_json': + if args.command == 'treat-json': for f in files_opt: treat_infofile(f) elif args.command == 'ls': @@ -1515,6 +1539,13 @@ def _main(args): print(f) for study_session, sequences in study_sessions.items(): print("\t%s %d sequences" % (str(study_session), len(sequences))) + elif args.command == 'populate-templates': + heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) + for f in files_opt: + populate_bids_templates( + f, + getattr(heuristic, 'DEFAULT_FIELDS', {}) + ) else: raise ValueError("Unknown command %s", args.command) return @@ -1714,7 +1745,7 @@ def get_parser(): parser.add_argument('--dbg', action='store_true', dest='debug', help="do not catch exceptions and show exception traceback") parser.add_argument('--command', dest='command', - choices=('treat_json', 'ls'), + choices=('treat-json', 'ls', 'populate-templates'), help='''custom actions to be performed on provided files instead of regular operation.''') parser.add_argument( diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 9afbc160..bbe84955 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -72,6 +72,17 @@ '1.3.12.2.1107.5.2.43.66112.30000016102813152550600000004', # double scout ] +DEFAULT_FIELDS = { + # Let it just be in each json file extracted + #'Manufacturer': "Siemens", + #'ManufacturersModelName': "Prisma", + "Acknowledgements": + "We thank Terry Sacket and the rest of the DBIC (Dartmouth Brain Imaging " + "Center) personnel for assistance in data collection, and " + "Yaroslav Halchenko and Matteo Visconti for preparing BIDS dataset. 
" + "TODO: more", +} + def filter_dicom(dcmdata): """Return True if a DICOM dataset should be filtered out, else False""" diff --git a/tests/test_main.py b/tests/test_main.py index 683909b1..82da08a4 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -36,10 +36,13 @@ def test_create_file_if_missing(tmpdir): def test_populate_bids_templates(tmpdir): - heudiconv.populate_bids_templates(str(tmpdir)) + heudiconv.populate_bids_templates( + str(tmpdir), + defaults={'Acknowledgements': 'something'}) for f in "README", "dataset_description.json", "CHANGES": # Just test that we have created them and they all have stuff TODO assert "TODO" in tmpdir.join(f).read() + assert "something" in tmpdir.join('dataset_description.json').read() def test_add_participant_record(tmpdir): From d4b4b190b549d934e23a63cbe7642478629302cf Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 17 Feb 2017 17:00:10 -0500 Subject: [PATCH 105/181] ENH: just to avoid all the possible whining from TempDir ;-) --- bin/heudiconv | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 68838798..d4db0168 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -99,8 +99,10 @@ StudySessionInfo = namedtuple( class TempDirs(object): """A helper to centralize handling and cleanup of dirs""" + def __init__(self): self.dirs = [] + self.exists = os.path.exists def __call__(self, prefix=None): tmpdir = mkdtemp(prefix=prefix) @@ -108,7 +110,11 @@ class TempDirs(object): return tmpdir def __del__(self): - self.cleanup() + try: + self.cleanup() + except AttributeError: + # we are too late to the show + pass def cleanup(self): lgr.debug("Removing %d temporary directories", len(self.dirs)) @@ -116,9 +122,10 @@ class TempDirs(object): lgr.debug("Removing %s", t) if self: self.rmtree(t) + self.dirs = [] def rmtree(self, tmpdir): - if os.path.exists(tmpdir): + if self.exists(tmpdir): shutil.rmtree(tmpdir) if tmpdir in self.dirs: self.dirs.remove(tmpdir) From 5f8fe455570a6984828ef6e89cdb5c34dfc96b44 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 21 Feb 2017 15:41:47 -0500 Subject: [PATCH 106/181] skip one run in A000297 (was canceled), include study hash in ls output --- bin/heudiconv | 9 ++++++++- heuristics/dbic_bids.py | 10 ++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/bin/heudiconv b/bin/heudiconv index d4db0168..67fefb4d 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1541,13 +1541,20 @@ def _main(args): treat_infofile(f) elif args.command == 'ls': heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) + heuristic_ls = getattr(heuristic, 'ls', None) for f in files_opt: study_sessions = get_study_sessions( dicom_dir_template, [f], heuristic, None, session, subjs) print(f) for study_session, sequences in study_sessions.items(): - print("\t%s %d sequences" % (str(study_session), len(sequences))) + suf = '' + if heuristic_ls: + suf += heuristic_ls(study_session, sequences) + print( + "\t%s %d sequences%s" + % (str(study_session), len(sequences), suf) + ) elif args.command == 'populate-templates': heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) for f in files_opt: diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index e978f9ca..11d137a8 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -21,6 +21,7 @@ 'A000127': ['^21-'], 'A000130': ['^15-'], 'A000137': ['^9-', '^11-'], + 'A000297': ['^12-'], } # dictionary containing fixes, keys are md5sum of study_description from @@ -102,6 
+103,7 @@ def filter_files(fn): sequence_dir = split2[1] split3 = os.path.split(split2[0]) accession_number = split3[1] + return True if accession_number == 'A000043': # crazy one that got copied for some runs but not for others, # so we are going to discard those that got copied and let heudiconv @@ -188,6 +190,14 @@ def fix_dbic_protocol(seqinfo, keys=keys2replace, subsdict=protocols2fix): return seqinfo +def ls(study_session, seqinfo): + """Additional ls output for a seqinfo""" + #assert len(sequences) <= 1 # expecting only a single study here + #seqinfo = sequences.keys()[0] + study_descr = get_unique(seqinfo, 'study_description') + study_descr_hash = md5sum(study_descr) + return ' study hash: %s' % study_descr_hash + # XXX we killed session indicator! what should we do now?!!! # WE DON:T NEED IT -- it will be provided into conversion_info as `session` # So we just need subdir and file_suffix! From a5dcfca58ae861a52f1e8c322ff5fa31b452a752 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Tue, 21 Feb 2017 15:43:14 -0500 Subject: [PATCH 107/181] allow for empty age (N/A), skip studies with unknown locator, skip run for A000297 --- bin/heudiconv | 6 +++++- heuristics/dbic_bids.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/bin/heudiconv b/bin/heudiconv index d4db0168..c00e3ada 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1422,7 +1422,7 @@ def add_participant_record(studydir, subject, age, sex): with open(participants_tsv, 'a') as f: f.write('\t'.join(map(str, [ participant_id, - age.lstrip('0').rstrip('Y'), + age.lstrip('0').rstrip('Y') if age else 'N/A', sex, 'control'])) + '\n') @@ -1588,6 +1588,10 @@ def _main(args): else: dicoms = files_or_seqinfo seqinfo = None + + if locator == 'unknown': + lgr.warning("Skipping unknown locator dataset") + continue if args.queue: if seqinfo and not dicoms: diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index e978f9ca..6483a139 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -21,6 +21,7 @@ 'A000127': ['^21-'], 'A000130': ['^15-'], 'A000137': ['^9-', '^11-'], + 'A000297': ['^12-'], } # dictionary containing fixes, keys are md5sum of study_description from From b26cdfe065192646d99589ef1bcc8bbc0e8e33eb Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 27 Feb 2017 15:21:08 -0500 Subject: [PATCH 108/181] singularity def - deploy bids-validator etc --- custom/dbic/singularity-env.def | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/custom/dbic/singularity-env.def b/custom/dbic/singularity-env.def index b0413290..fc83a1f5 100644 --- a/custom/dbic/singularity-env.def +++ b/custom/dbic/singularity-env.def @@ -22,7 +22,10 @@ # grep -l loop /proc/*/mountinfo | sed -e 's,/proc/\(.*\)/.*,\1,g' | while read pid; do sudo kill $pid; done BootStrap: debootstrap -OSVersion: stable +#OSVersion: stable +# needs nipype 0.12.1 but that one didn't build for stable since needs python-prov... 
+# so trying stretch +OSVersion: stretch MirrorURL: http://ftp.us.debian.org/debian/ # so if image is executed we just enter the environment @@ -34,11 +37,21 @@ MirrorURL: http://ftp.us.debian.org/debian/ %post echo "Configuring the environment" apt-get update - apt-get -y install vim eatmydata wget strace time ncdu + apt-get -y install eatmydata + eatmydata apt-get -y install vim wget strace time ncdu gnupg curl procps wget -q -O/tmp/nd-configurerepo https://raw.githubusercontent.com/neurodebian/neurodebian/4d26c8f30433145009aa3f74516da12f560a5a13/tools/nd-configurerepo bash /tmp/nd-configurerepo chmod a+r -R /etc/apt - eatmydata apt-get -y install datalad python-nipype virtualenv dcm2niix + eatmydata apt-get -y install datalad python-nipype virtualenv dcm2niix python-dcmstack python-configparser python-funcsigs python-pytest + + # for bids-validator + curl -sL https://deb.nodesource.com/setup_6.x | bash - && \ + eatmydata apt-get install -y nodejs + npm install -g bids-validator@0.20.0 + chmod a+rX -R /usr/lib/node_modules/ + + chmod a+rX -R /etc/apt/sources.list.d + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* apt-get clean # and wipe out apt lists since not to be used RW for further tuning From b12290187f35bec717cf3b008a41680b85d2639f Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 2 Mar 2017 11:21:31 -0500 Subject: [PATCH 109/181] Convert tabs to spaces --- heuristics/dbic_bids.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 11d137a8..36a569a2 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -449,12 +449,12 @@ def infotoids(seqinfos, outdir): subject = fixup_subjectid(get_unique(seqinfos, 'patient_id')) # TODO: fix up subject id if missing some 0s if study_description: - split = study_description.split('^', 1) - # split first one even more, since couldbe PI_Student or PI-Student - split = re.split('-|_', split[0], 1) + split[1:] + split = study_description.split('^', 1) + # split first one even more, since couldbe PI_Student or PI-Student + split = re.split('-|_', split[0], 1) + split[1:] - # locator = study_description.replace('^', '/') - locator = '/'.join(split) + # locator = study_description.replace('^', '/') + locator = '/'.join(split) else: locator = 'unknown' From 173d4cad621a3b095f937d48ac80d68df69d7aed Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 2 Mar 2017 12:04:29 -0500 Subject: [PATCH 110/181] Allow to group according to accession number instead studyuid --- bin/heudiconv | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 8ac3afd9..f33a9f10 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -262,7 +262,8 @@ def find_files(regex, topdir=curdir, exclude=None, exclude_vcs=True, dirs=False) find_files.__doc__ %= (_VCS_REGEX,) -def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=False): +def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=False, + per_accession_number=False): """Process list of dicoms and return seqinfo and file group `seqinfo` contains per-sequence extract of fields from DICOMs which @@ -281,6 +282,8 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=F per_studyUID : bool, optional Then would add a StudyInstanceUID into study id. So it would not then generalize across re-runs on new data. 
+ per_accession_number : bool, optional + If True, group according to accession number instead of study id Returns ------- @@ -413,6 +416,7 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=F if per_studyUID: studyUID = series_id[2] series_id = series_id[:2] + accession_number = dcminfo.get('AccessionNumber') series_id = '-'.join(map(str, series_id)) @@ -445,7 +449,7 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=F dcminfo.get('ReferringPhysicianName'), dcminfo.get('SeriesDescription'), tuple(dcminfo.ImageType), - dcminfo.get('AccessionNumber'), + accession_number, # For demographics to populate BIDS participants.tsv dcminfo.get('PatientsAge'), dcminfo.get('PatientsSex'), @@ -468,12 +472,19 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=F if studyUID not in seqinfo: seqinfo[studyUID] = ordereddict() seqinfo[studyUID][info] = files + elif per_accession_number: + if accession_number not in seqinfo: + seqinfo[accession_number] = ordereddict() + seqinfo[accession_number][info] = files else: seqinfo[info] = files if per_studyUID: lgr.info("Generated sequence info for %d studies with %d entries total", len(seqinfo), sum(map(len, seqinfo.values()))) + elif per_accession_number: + lgr.info("Generated sequence info for %d accession numbers with %d entries total", + len(seqinfo), sum(map(len, seqinfo.values()))) else: lgr.info("Generated sequence info with %d entries", len(seqinfo)) return seqinfo @@ -1265,7 +1276,8 @@ def get_study_sessions(dicom_dir_template, files_opt, heuristic, outdir, files_, flfilter=getattr(heuristic, 'filter_files', None), dcmfilter=getattr(heuristic, 'filter_dicom', None), - per_studyUID=True) + #per_studyUID=True, + per_accession_number=True) if not getattr(heuristic, 'infotoids', None): raise NotImplementedError( From 369c94382b39c1cb099bc0b66cd56d61b09e7e57 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 2 Mar 2017 12:16:56 -0500 Subject: [PATCH 111/181] Give more debug information --- bin/heudiconv | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bin/heudiconv b/bin/heudiconv index f33a9f10..3f8d4545 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -459,7 +459,14 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=F # len(dcminfo.ReferencedImageSequence) # len(dcminfo.SourceImageSequence) # FOR demographics - lgr.debug("%30s %27s %27s %5s nref=%-2d nsrc=%-2d %s" % ( + if per_studyUID: + key = studyUID.split('.')[-1] + elif per_accession_number: + key = accession_number + else: + key = '' + lgr.debug("%30s %30s %27s %27s %5s nref=%-2d nsrc=%-2d %s" % ( + key, info.series_id, dcminfo.SeriesDescription, dcminfo.ProtocolName, From c14ca8cc5f477e73dae23f26907da3fc2f1d9e7b Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 2 Mar 2017 15:26:22 -0500 Subject: [PATCH 112/181] Get only unique session markers if we have common ones --- heuristics/dbic_bids.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 36a569a2..3511f9f7 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -483,6 +483,8 @@ def infotoids(seqinfos, outdir): " We will process until the end of the first session" ) if nonsign_vals: + # get only unique values + ses_markers = list(set(ses_markers)) if set(ses_markers).intersection('+='): raise NotImplementedError( "Should not mix hardcoded session markers with incremental ones (+=)" From 
7c97196bb6db1c04dabc1919984d6ee6f06fde48 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Thu, 2 Mar 2017 17:42:15 -0500 Subject: [PATCH 113/181] Add option for grouping by studyUID or accession_number --- bin/heudiconv | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 3f8d4545..092702b1 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -262,8 +262,7 @@ def find_files(regex, topdir=curdir, exclude=None, exclude_vcs=True, dirs=False) find_files.__doc__ %= (_VCS_REGEX,) -def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=False, - per_accession_number=False): +def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, grouping='studyUID'): """Process list of dicoms and return seqinfo and file group `seqinfo` contains per-sequence extract of fields from DICOMs which @@ -279,11 +278,8 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=F dcmfilter : callable, optional If called on dcm_data and returns True, it is used to set series_id - per_studyUID : bool, optional - Then would add a StudyInstanceUID into study id. So it would not then - generalize across re-runs on new data. - per_accession_number : bool, optional - If True, group according to accession number instead of study id + grouping : str ('studyUID', 'accession_number') + what to group by: studyUID or accession_number Returns ------- @@ -293,6 +289,11 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=F filegrp : dict `filegrp` is a dictionary with files groupped per each sequence """ + allowed_groupings = ['studyUID', 'accession_number'] + if grouping not in allowed_groupings: + raise ValueError('I do not know how to group by {0}'.format(grouping)) + per_studyUID = grouping == 'studyUID' + per_accession_number = grouping == 'accession_number' lgr.info("Analyzing %d dicoms", len(fl)) import dcmstack as ds import dicom as dcm @@ -338,7 +339,7 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=F # verify that we are working with a single study if studyUID is None: studyUID = studyUID_ - else: + elif not per_accession_number: assert studyUID == studyUID_ except AttributeError as exc: lgr.warning('Ignoring %s since not quite a "normal" DICOM: %s', @@ -1220,7 +1221,7 @@ def load_heuristic(heuristic_file): def get_study_sessions(dicom_dir_template, files_opt, heuristic, outdir, - session, sids): + session, sids, grouping='studyUID'): """Given options from cmdline sort files or dicom seqinfos into study_sessions which put together files for a single session of a subject in a study @@ -1283,8 +1284,7 @@ def get_study_sessions(dicom_dir_template, files_opt, heuristic, outdir, files_, flfilter=getattr(heuristic, 'filter_files', None), dcmfilter=getattr(heuristic, 'filter_dicom', None), - #per_studyUID=True, - per_accession_number=True) + grouping=grouping) if not getattr(heuristic, 'infotoids', None): raise NotImplementedError( @@ -1552,6 +1552,7 @@ def _main(args): session = args.session subjs = args.subjs outdir = os.path.abspath(args.outdir) + grouping = args.grouping if args.command: # custom mode of operation @@ -1564,7 +1565,7 @@ def _main(args): for f in files_opt: study_sessions = get_study_sessions( dicom_dir_template, [f], - heuristic, None, session, subjs) + heuristic, None, session, subjs, grouping=grouping) print(f) for study_session, sequences in study_sessions.items(): suf = '' @@ -1593,7 +1594,8 @@ def 
_main(args): # TODO: Move into a function! study_sessions = get_study_sessions( dicom_dir_template, files_opt, - heuristic, outdir, session, subjs) + heuristic, outdir, session, subjs, + grouping=grouping) # extract tarballs, and replace their entries with expanded lists of files # TODO: we might need to sort so sessions are ordered??? lgr.info("Need to process %d study sessions", len(study_sessions)) @@ -1787,6 +1789,10 @@ def get_parser(): choices=('treat-json', 'ls', 'populate-templates'), help='''custom actions to be performed on provided files instead of regular operation.''') + parser.add_argument('-g', '--grouping', + default='studyUID', + choices=('studyUID', 'accession_number'), + help='''How to group dicoms (default: by studyUID)''') parser.add_argument( 'files', nargs='*', From fb9c62e8e9b911ecfd9131f86474d910bedc0e0f Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 2 Mar 2017 18:56:19 -0500 Subject: [PATCH 114/181] BF+ENH: some tabs, do .format on target protocol_name,description, renamings for QA series --- bin/heudiconv | 2 ++ heuristics/dbic_bids.py | 30 ++++++++++++++++++++++++------ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 8ac3afd9..80148dca 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -78,6 +78,7 @@ SeqInfo = namedtuple( 'accession_number', 'patient_age', 'patient_sex', + 'date' ] ) @@ -449,6 +450,7 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, per_studyUID=F # For demographics to populate BIDS participants.tsv dcminfo.get('PatientsAge'), dcminfo.get('PatientsSex'), + dcminfo.get('AcquisitionDate'), ) # candidates # dcminfo.AccessionNumber diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 11d137a8..56ce1189 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -28,6 +28,19 @@ # dicoms, in the form of PI-Experimenter^protocolname # values are list of tuples in the form (regex_pattern, substitution) protocols2fix = { + # QA + '43b67d9139e8c7274578b7451ab21123': + [ + #('anat-scout.*', 'anat-scout_ses-{date}'), + ('anat-scout.*', 'anat-scout'), + ('BOLD_p2_s4(_3\.5mm)?', 'func_run+_task-rest_acq-p2s4'), + ('BOLD_p2', 'func_run+_task-rest_acq-p2'), + ('BOLD_', 'func_run+_task-rest'), + ('DTI_30_p2_s4(_3\.5mm)?', 'dwi_run+_acq-30p2s4'), + ('DTI_30_p2', 'dwi_run+_acq-30p2'), + ('_p2_s4(_3\.5mm)?', '_acq-p2s4'), + ('_p2', '_acq-p2'), + ], '9d148e2a05f782273f6343507733309d': [('anat_', 'anat-'), ('run-life[0-9]', 'run+_task-life'), @@ -237,6 +250,11 @@ def infotodict(seqinfo): skipped.append(s.series_id) lgr.debug("Ignoring derived data %s", s.series_id) continue + + # possibly apply present formatting in the series_description or protocol name + for f in 'series_description', 'protocol_name': + s = s._replace(**{f: getattr(s, f).format(**s._asdict())}) + template = None suffix = '' seq = [] @@ -379,7 +397,7 @@ def infotodict(seqinfo): # some are ok to skip and not to whine if "_Scout" in s.series_description or \ - (seqtype == 'anat' and seqtype_label == 'scout'): + (seqtype == 'anat' and seqtype_label.startswith('scout')): skipped.append(s.series_id) lgr.debug("Ignoring %s", s.series_id) else: @@ -449,12 +467,12 @@ def infotoids(seqinfos, outdir): subject = fixup_subjectid(get_unique(seqinfos, 'patient_id')) # TODO: fix up subject id if missing some 0s if study_description: - split = study_description.split('^', 1) - # split first one even more, since couldbe PI_Student or PI-Student - split = re.split('-|_', split[0], 1) + split[1:] + split = 
study_description.split('^', 1) + # split first one even more, since couldbe PI_Student or PI-Student + split = re.split('-|_', split[0], 1) + split[1:] - # locator = study_description.replace('^', '/') - locator = '/'.join(split) + # locator = study_description.replace('^', '/') + locator = '/'.join(split) else: locator = 'unknown' From d566746deb2bc4b1541410baf55d174f91de8bbf Mon Sep 17 00:00:00 2001 From: DBIC BIDS Team Date: Thu, 2 Mar 2017 19:22:47 -0500 Subject: [PATCH 115/181] ENH: rename hardi_64 in all studies --- heuristics/dbic_bids.py | 1 + 1 file changed, 1 insertion(+) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 3511f9f7..d6feb9fe 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -525,6 +525,7 @@ def parse_dbic_protocol_name(protocol_name): # Since Yarik didn't know better place to put it in, but could migrate outside # at some point protocol_name = protocol_name.replace("anat_T1w", "anat-T1w") + protocol_name = protocol_name.replace("hardi_64", "dwi_acq-hardi64") # Parse the name according to our convention # https://docs.google.com/document/d/1R54cgOe481oygYVZxI7NHrifDyFUZAjOBwCTu7M7y48/edit?usp=sharing From 3663cee02b91d82c9ba07345096722f127e141d7 Mon Sep 17 00:00:00 2001 From: DBIC BIDS Team Date: Thu, 2 Mar 2017 19:24:21 -0500 Subject: [PATCH 116/181] ENH: dbic bids validator config to skip .heudiconv and .datalad labs --- heuristics/dbic_bids_validator.cfg | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 heuristics/dbic_bids_validator.cfg diff --git a/heuristics/dbic_bids_validator.cfg b/heuristics/dbic_bids_validator.cfg new file mode 100644 index 00000000..bd14f996 --- /dev/null +++ b/heuristics/dbic_bids_validator.cfg @@ -0,0 +1,7 @@ +{ + "ignore": [], + "warn": [], + "error": [], + "ignoredFiles": ["/.heudiconv/*", "/.heudiconv/*/*", "/.heudiconv/*/*/*", "/.git*", "/.datalad/*"] +} + From 1927f56e00698e629f1098ceec9756a58d6bf6f3 Mon Sep 17 00:00:00 2001 From: DBIC BIDS Team Date: Thu, 2 Mar 2017 19:26:04 -0500 Subject: [PATCH 117/181] BF: avoid growing number of annex thining + chmod file before writing --- bin/heudiconv | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bin/heudiconv b/bin/heudiconv index 092702b1..8cd10fc7 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -997,6 +997,10 @@ and there was a screw up for that A in that one though """ + if global_options['overwrite'] and lexists(scaninfo): + # TODO: handle annexed file case + if not os.path.islink(scaninfo): + os.chmod(scaninfo, 0o0660) res = embedfunc.run() os.chmod(scaninfo, 0o0440) if with_prov: @@ -1470,6 +1474,7 @@ def add_to_datalad(topdir, studydir, msg=None, bids=False): force=True, if_dirty='ignore', # see https://github.com/datalad/datalad/issues/1016 no_annex=True, # need to add .gitattributes first anyways + shared_access='all', annex_version=6) assert ds == ds_ assert ds.is_installed() @@ -1486,7 +1491,8 @@ def add_to_datalad(topdir, studydir, msg=None, bids=False): *.tgz annex.largefiles=(largerthan=0kb) """) # so for mortals it just looks like a regular directory! 
- ds.config.add('annex.thin', 'true', where='local') + if not ds.config.get('annex.thin'): + ds.config.add('annex.thin', 'true', where='local') # initialize annex there if not yet initialized AnnexRepo(ds.path, init=True) # Let's make it a From a2ad180dbef257e3e1ae2a8e3c28c05ea45750a3 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 2 Mar 2017 20:30:01 -0500 Subject: [PATCH 118/181] ENH: "lower" effective nipype logging to WARN in our INFO level, use our handler for datalad --- bin/heudiconv | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/bin/heudiconv b/bin/heudiconv index 502a7927..5496d52c 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -52,6 +52,23 @@ logging.basicConfig( ) lgr.debug("Starting the abomination") # just to "run-test" logging +# for nipype, we would like to lower level unless we are at debug level +try: + import nipype + for l in nipype.logging.loggers.values(): + if lgr.getEffectiveLevel() > 10: + l.setLevel(logging.WARN) +except ImportError: + pass + +try: + # Set datalad's logger to our handler + import datalad + datalad_lgr = logging.getLogger('datalad') + datalad_lgr.handlers = lgr.handlers +except ImportError: + pass + global_options = { 'overwrite': False # overwrite existing files } From 3feb6d49ebb2cdadb85ecc3f5c2fb0969fc3c01d Mon Sep 17 00:00:00 2001 From: DBIC BIDS Team Date: Sat, 4 Mar 2017 10:02:46 -0500 Subject: [PATCH 119/181] Check that seqtype_label exists, ignore some files for validator --- bin/heudiconv | 35 ++++++++++++++++-------------- heuristics/dbic_bids.py | 3 ++- heuristics/dbic_bids_validator.cfg | 2 +- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 5496d52c..861e8991 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -52,22 +52,25 @@ logging.basicConfig( ) lgr.debug("Starting the abomination") # just to "run-test" logging -# for nipype, we would like to lower level unless we are at debug level -try: - import nipype - for l in nipype.logging.loggers.values(): - if lgr.getEffectiveLevel() > 10: - l.setLevel(logging.WARN) -except ImportError: - pass - -try: - # Set datalad's logger to our handler - import datalad - datalad_lgr = logging.getLogger('datalad') - datalad_lgr.handlers = lgr.handlers -except ImportError: - pass +# bloody git screws things up +# https://github.com/gitpython-developers/GitPython/issues/600 +# +## for nipype, we would like to lower level unless we are at debug level +#try: +# import nipype +# for l in nipype.logging.loggers.values(): +# if lgr.getEffectiveLevel() > 10: +# l.setLevel(logging.WARN) +#except ImportError: +# pass +# +#try: +# # Set datalad's logger to our handler +# import datalad +# datalad_lgr = logging.getLogger('datalad') +# datalad_lgr.handlers = lgr.handlers +#except ImportError: +# pass global_options = { 'overwrite': False # overwrite existing files diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 0bf7da47..8b2d1cb1 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -34,6 +34,7 @@ #('anat-scout.*', 'anat-scout_ses-{date}'), ('anat-scout.*', 'anat-scout'), ('BOLD_p2_s4(_3\.5mm)?', 'func_run+_task-rest_acq-p2s4'), + ('BOLD_p2_noprescannormalize', 'func_run+_task-rest_acq-p2noprescannormalize'), ('BOLD_p2', 'func_run+_task-rest_acq-p2'), ('BOLD_', 'func_run+_task-rest'), ('DTI_30_p2_s4(_3\.5mm)?', 'dwi_run+_acq-30p2s4'), @@ -397,7 +398,7 @@ def infotodict(seqinfo): # some are ok to skip and not to whine if "_Scout" in s.series_description or \ - (seqtype == 'anat' and 
seqtype_label.startswith('scout')): + (seqtype == 'anat' and seqtype_label and seqtype_label.startswith('scout')): skipped.append(s.series_id) lgr.debug("Ignoring %s", s.series_id) else: diff --git a/heuristics/dbic_bids_validator.cfg b/heuristics/dbic_bids_validator.cfg index bd14f996..5a05e5fb 100644 --- a/heuristics/dbic_bids_validator.cfg +++ b/heuristics/dbic_bids_validator.cfg @@ -2,6 +2,6 @@ "ignore": [], "warn": [], "error": [], - "ignoredFiles": ["/.heudiconv/*", "/.heudiconv/*/*", "/.heudiconv/*/*/*", "/.git*", "/.datalad/*"] + "ignoredFiles": ["/.heudiconv/*", "/.heudiconv/*/*", "/.heudiconv/*/*/*", "/.heudiconv/*/*/*/*", "/.git*", "/.datalad/*"] } From 5f5b400963896a6325b4cddfea65edc6d9a02f40 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sat, 4 Mar 2017 14:14:11 -0500 Subject: [PATCH 120/181] Start adding small monitoring script --- bin/monitor.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 bin/monitor.py diff --git a/bin/monitor.py b/bin/monitor.py new file mode 100644 index 00000000..8669301e --- /dev/null +++ b/bin/monitor.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +import logging +import inotify.adapters +from inotify.constants import IN_MODIFY, IN_CREATE, IN_ISDIR +from collections import deque +import os +from datetime import date +import re +import time + +_DEFAULT_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' +_LOGGER = logging.getLogger(__name__) + +MASK = (IN_MODIFY | IN_CREATE) +MASK_NEWDIR = (IN_CREATE | IN_ISDIR) +WAIT_TIME = 10 # in seconds + + +def _configure_logging(): + _LOGGER.setLevel(logging.DEBUG) + + ch = logging.StreamHandler() + + formatter = logging.Formatter(_DEFAULT_LOG_FORMAT) + ch.setFormatter(formatter) + + _LOGGER.addHandler(ch) + + +def process(paths2process): + if paths2process and time.time() - os.path.getmtime(paths2process[0]) > WAIT_TIME: + process_me = paths2process.popleft() + print("Time to process {0}".format(process_me)) + time.sleep(3) + print("Done processing {0}".format(process_me)) + + +#"inbox/DICOM" "/20../../.." 
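# An illustrative sketch (values are made up, not part of the patch): monitor()
# only queues directories whose path matches topdir + check_ptrn, i.e. the
# date-named folders new DICOM data lands in; everything else is not queued.
import re

topdir = '/inbox/DICOM'
check_ptrn = '/20../../..'                               # YYYY/MM/DD, the default
path_re = re.compile("(%s%s)/?$" % (topdir, check_ptrn))

assert path_re.match('/inbox/DICOM/2017/03/04')          # date folder: queued
assert path_re.match('/inbox/DICOM/2017/03/04/')         # trailing slash is fine
assert not path_re.match('/inbox/DICOM/2017/03/04/A01')  # deeper paths: not queued
assert not path_re.match('/inbox/DICOM/logs')            # non-date folders: skipped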
+def monitor(topdir='/tmp/new_dir', check_ptrn='/20../../..'): + paths2process = deque() + # watch only today's folder + path_re = re.compile("(%s%s)/?$" % (topdir, check_ptrn)) + i = inotify.adapters.InotifyTree(topdir.encode(), mask=MASK) + for event in i.event_gen(): + if event is not None: + (header, type_names, watch_path, filename) = event + if path_re.match(watch_path.decode('utf-8')): + # we got our directory, now let's do something on it + _LOGGER.info("WD=(%d) MASK=(%d) COOKIE=(%d) LEN=(%d) MASK->NAMES=%s " + "WATCH-PATH=[%s] FILENAME=[%s]", + header.wd, header.mask, header.cookie, header.len, type_names, + watch_path.decode('utf-8'), filename.decode('utf-8')) + newpath2process = os.path.join(watch_path, filename) + paths2process.append(newpath2process) + print(newpath2process) + # check if there's anything to process + process(paths2process) + + +if __name__ == '__main__': + _configure_logging() + monitor() From 9ec5dd6852ec83146373c77475dbea5743efdff2 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sat, 4 Mar 2017 14:34:30 -0500 Subject: [PATCH 121/181] Add argument parser --- bin/monitor.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/bin/monitor.py b/bin/monitor.py index 8669301e..bb7bfa97 100644 --- a/bin/monitor.py +++ b/bin/monitor.py @@ -57,6 +57,16 @@ def monitor(topdir='/tmp/new_dir', check_ptrn='/20../../..'): process(paths2process) +def parse_args(): + import argparse + parser = argparse.ArgumentParser(prog='monitor.py', description='Small monitoring script to detect new directories and process them', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('path', help='Which directory to monitor') + parser.add_argument('--check_ptrn', '-p', help='regexp pattern for which subdirectories to check', default='/20../../..') + + return parser.parse_args() + + if __name__ == '__main__': _configure_logging() - monitor() + parsed = parse_args() + monitor(parsed.path, parsed.check_ptrn) From 74dc957a78c5b33e6ca25fe2f16cd6de76092ddc Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sat, 4 Mar 2017 15:19:29 -0500 Subject: [PATCH 122/181] Add database logging --- bin/monitor.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/bin/monitor.py b/bin/monitor.py index bb7bfa97..a6cba89a 100644 --- a/bin/monitor.py +++ b/bin/monitor.py @@ -1,41 +1,50 @@ #!/usr/bin/env python import logging -import inotify.adapters -from inotify.constants import IN_MODIFY, IN_CREATE, IN_ISDIR -from collections import deque import os -from datetime import date import re +import subprocess import time +from collections import deque +from datetime import date +import inotify.adapters +from inotify.constants import IN_MODIFY, IN_CREATE, IN_ISDIR +from tinydb import TinyDB + _DEFAULT_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' _LOGGER = logging.getLogger(__name__) MASK = (IN_MODIFY | IN_CREATE) MASK_NEWDIR = (IN_CREATE | IN_ISDIR) -WAIT_TIME = 10 # in seconds +WAIT_TIME = 2 # in seconds +# open database +db = TinyDB('database.json') def _configure_logging(): _LOGGER.setLevel(logging.DEBUG) - ch = logging.StreamHandler() - formatter = logging.Formatter(_DEFAULT_LOG_FORMAT) ch.setFormatter(formatter) - _LOGGER.addHandler(ch) def process(paths2process): + cmd = 'echo heudiconv {0}' if paths2process and time.time() - os.path.getmtime(paths2process[0]) > WAIT_TIME: - process_me = paths2process.popleft() - print("Time to process {0}".format(process_me)) - 
time.sleep(3) - print("Done processing {0}".format(process_me)) + process_me = paths2process.popleft().decode('utf-8') + cmd_ = cmd.format(process_me) + try: + print("Time to process {0}".format(process_me)) + subprocess.check_call(cmd_.split()) + _LOGGER.info("Done running {0}".format(cmd_)) + # here we should inspect output and then store additional info + db.insert({'input_path': process_me, 'success': 1, 'subject_id': 'TODO', 'output_path': 'TODO'}) + except subprocess.CalledProcessError: + _LOGGER.error("{0} failed".format(cmd_)) + db.insert({'input_path': process_me, 'success': 0}) -#"inbox/DICOM" "/20../../.." def monitor(topdir='/tmp/new_dir', check_ptrn='/20../../..'): paths2process = deque() # watch only today's folder From 42f7ff10d65337726c7b47b4f10b495d9cbe57e5 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sat, 4 Mar 2017 18:37:05 -0500 Subject: [PATCH 123/181] Add initial test --- bin/monitor.py | 86 ++++++++++++++++++++++++++++----------------- bin/test_monitor.py | 43 +++++++++++++++++++++++ 2 files changed, 97 insertions(+), 32 deletions(-) create mode 100644 bin/test_monitor.py diff --git a/bin/monitor.py b/bin/monitor.py index a6cba89a..4d472699 100644 --- a/bin/monitor.py +++ b/bin/monitor.py @@ -5,7 +5,7 @@ import subprocess import time -from collections import deque +from collections import deque, OrderedDict from datetime import date import inotify.adapters from inotify.constants import IN_MODIFY, IN_CREATE, IN_ISDIR @@ -16,10 +16,8 @@ MASK = (IN_MODIFY | IN_CREATE) MASK_NEWDIR = (IN_CREATE | IN_ISDIR) -WAIT_TIME = 2 # in seconds +WAIT_TIME = 10 # in seconds -# open database -db = TinyDB('database.json') def _configure_logging(): _LOGGER.setLevel(logging.DEBUG) @@ -29,41 +27,62 @@ def _configure_logging(): _LOGGER.addHandler(ch) -def process(paths2process): +def process(paths2process, db): cmd = 'echo heudiconv {0}' - if paths2process and time.time() - os.path.getmtime(paths2process[0]) > WAIT_TIME: - process_me = paths2process.popleft().decode('utf-8') - cmd_ = cmd.format(process_me) - try: - print("Time to process {0}".format(process_me)) - subprocess.check_call(cmd_.split()) - _LOGGER.info("Done running {0}".format(cmd_)) - # here we should inspect output and then store additional info - db.insert({'input_path': process_me, 'success': 1, 'subject_id': 'TODO', 'output_path': 'TODO'}) - except subprocess.CalledProcessError: - _LOGGER.error("{0} failed".format(cmd_)) - db.insert({'input_path': process_me, 'success': 0}) - - -def monitor(topdir='/tmp/new_dir', check_ptrn='/20../../..'): - paths2process = deque() + #if paths2process and time.time() - os.path.getmtime(paths2process[0]) > WAIT_TIME: + processed = [] + for path, mod_time in paths2process.items(): + if time.time() - mod_time > WAIT_TIME: + #process_me = paths2process.popleft().decode('utf-8') + process_me = path + cmd_ = cmd.format(process_me) + try: + print("Time to process {0}".format(process_me)) + subprocess.check_call(cmd_.split()) + _LOGGER.info("Done running {0}".format(cmd_)) + # here we should inspect output and then store additional info + db.insert({'input_path': process_me, 'success': 1, 'subject_id': 'TODO', 'output_path': 'TODO'}) + except subprocess.CalledProcessError: + _LOGGER.error("{0} failed".format(cmd_)) + db.insert({'input_path': process_me, 'success': 0}) + # if we processed it, or it failed, we need to remove it to avoid running it again + processed.append(path) + for processed_path in processed: + del paths2process[processed_path] + + +def 
inspect_heudiconv_output(heudiconv_output): + pass + + +def monitor(topdir='/tmp/new_dir', check_ptrn='/20../../..', db=None): + #paths2process = deque() + paths2process = OrderedDict() # watch only today's folder path_re = re.compile("(%s%s)/?$" % (topdir, check_ptrn)) - i = inotify.adapters.InotifyTree(topdir.encode(), mask=MASK) + i = inotify.adapters.InotifyTree(topdir.encode())#, mask=MASK) for event in i.event_gen(): if event is not None: (header, type_names, watch_path, filename) = event - if path_re.match(watch_path.decode('utf-8')): + _LOGGER.info("WD=(%d) MASK=(%d) COOKIE=(%d) LEN=(%d) MASK->NAMES=%s " + "WATCH-PATH=[%s] FILENAME=[%s]", + header.wd, header.mask, header.cookie, header.len, type_names, + watch_path.decode('utf-8'), filename.decode('utf-8')) + if header.mask == MASK_NEWDIR and path_re.match(watch_path.decode('utf-8')): # we got our directory, now let's do something on it - _LOGGER.info("WD=(%d) MASK=(%d) COOKIE=(%d) LEN=(%d) MASK->NAMES=%s " - "WATCH-PATH=[%s] FILENAME=[%s]", - header.wd, header.mask, header.cookie, header.len, type_names, - watch_path.decode('utf-8'), filename.decode('utf-8')) - newpath2process = os.path.join(watch_path, filename) - paths2process.append(newpath2process) - print(newpath2process) + newpath2process = os.path.join(watch_path, filename).decode('utf-8') + #paths2process.append(newpath2process) + # update time + paths2process[newpath2process] = time.time() + print(newpath2process, time.time()) + # check if we need to update the time + for path in paths2process.keys(): + if path in watch_path.decode('utf-8'): + paths2process[path] = time.time() + print("Updating {0}: {1}".format(path, paths2process[path])) + # check if there's anything to process - process(paths2process) + process(paths2process, db) def parse_args(): @@ -71,6 +90,7 @@ def parse_args(): parser = argparse.ArgumentParser(prog='monitor.py', description='Small monitoring script to detect new directories and process them', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('path', help='Which directory to monitor') parser.add_argument('--check_ptrn', '-p', help='regexp pattern for which subdirectories to check', default='/20../../..') + parser.add_argument('--database', '-d', help='database location', default='database.json') return parser.parse_args() @@ -78,4 +98,6 @@ def parse_args(): if __name__ == '__main__': _configure_logging() parsed = parse_args() - monitor(parsed.path, parsed.check_ptrn) + # open database + db = TinyDB(parsed.database) + monitor(parsed.path, parsed.check_ptrn, db) diff --git a/bin/test_monitor.py b/bin/test_monitor.py new file mode 100644 index 00000000..681ddde5 --- /dev/null +++ b/bin/test_monitor.py @@ -0,0 +1,43 @@ +from collections import namedtuple +import pytest +from mock import patch +from monitor import monitor, MASK_NEWDIR + + +class MockInotifyTree(object): + def __init__(self, events): + self.events = iter(events) + def event_gen(self): + for e in self.events: + yield e + def __call__(self, topdir): + return self + + +class MockTime(object): + def __init__(self, time): + self.time = time + def __call__(self): + return self.time + + +Header = namedtuple('header', ['wd', 'mask', 'cookie', 'len']) +header = Header(5, MASK_NEWDIR, 5, 5) +watch_path = b'WATCHME' +filename = b'FILE' +type_names = b'TYPE' + +path2 = watch_path + b'/' + filename + b'/subpath' + +my_events = [(header, type_names, watch_path, filename), + (header, type_names, path2, b'')] + +@patch('inotify.adapters.InotifyTree', MockInotifyTree(my_events)) 
+@patch('time.time', MockTime(42)) +def test_monitor(capsys): + monitor(watch_path.decode(), check_ptrn='') + out, err = capsys.readouterr() + desired_output = '{0}/{1} {2}\n'.format(watch_path.decode(), filename.decode(), 42) + desired_output += 'Updating {0}/{1}: {2}\n'.format(watch_path.decode(), filename.decode(), 42) + assert out == desired_output + From 6fe0e278a76059c67df636001a43e6e393eb9838 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 5 Mar 2017 10:41:47 -0500 Subject: [PATCH 124/181] Add more tests --- bin/monitor.py | 18 +++++++++--------- bin/test_monitor.py | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/bin/monitor.py b/bin/monitor.py index 4d472699..c6e279d4 100644 --- a/bin/monitor.py +++ b/bin/monitor.py @@ -19,20 +19,20 @@ WAIT_TIME = 10 # in seconds -def _configure_logging(): - _LOGGER.setLevel(logging.DEBUG) - ch = logging.StreamHandler() - formatter = logging.Formatter(_DEFAULT_LOG_FORMAT) - ch.setFormatter(formatter) - _LOGGER.addHandler(ch) +#def _configure_logging(): +_LOGGER.setLevel(logging.DEBUG) +ch = logging.StreamHandler() +formatter = logging.Formatter(_DEFAULT_LOG_FORMAT) +ch.setFormatter(formatter) +_LOGGER.addHandler(ch) -def process(paths2process, db): +def process(paths2process, db, wait=WAIT_TIME): cmd = 'echo heudiconv {0}' #if paths2process and time.time() - os.path.getmtime(paths2process[0]) > WAIT_TIME: processed = [] for path, mod_time in paths2process.items(): - if time.time() - mod_time > WAIT_TIME: + if time.time() - mod_time > wait: #process_me = paths2process.popleft().decode('utf-8') process_me = path cmd_ = cmd.format(process_me) @@ -99,5 +99,5 @@ def parse_args(): _configure_logging() parsed = parse_args() # open database - db = TinyDB(parsed.database) + db = TinyDB(parsed.database, default_table='heudiconv') monitor(parsed.path, parsed.check_ptrn, db) diff --git a/bin/test_monitor.py b/bin/test_monitor.py index 681ddde5..f685e8b1 100644 --- a/bin/test_monitor.py +++ b/bin/test_monitor.py @@ -1,7 +1,10 @@ from collections import namedtuple import pytest from mock import patch -from monitor import monitor, MASK_NEWDIR +from monitor import monitor, process, MASK_NEWDIR +from os.path import exists +from tinydb import TinyDB, Query +from subprocess import CalledProcessError class MockInotifyTree(object): @@ -40,4 +43,32 @@ def test_monitor(capsys): desired_output = '{0}/{1} {2}\n'.format(watch_path.decode(), filename.decode(), 42) desired_output += 'Updating {0}/{1}: {2}\n'.format(watch_path.decode(), filename.decode(), 42) assert out == desired_output - + + +@patch('time.time', MockTime(42)) +@pytest.mark.parametrize("side_effect,success", [ + (None, 1), + (CalledProcessError('mycmd', 1), 0) +]) +def test_process_success(tmpdir, capsys, side_effect, success): + db_fn = tmpdir.join('database.json') + db = TinyDB(db_fn.strpath) + paths2process = {'/my/path': 42} + # test 1: everything ok + with patch('subprocess.check_call') as mocked_call: + mocked_call.side_effect = side_effect + process(paths2process, db, wait=-30) + out, err = capsys.readouterr() + + mocked_call.assert_called_once() + assert db_fn.check() + # dictionary should be empty + assert not paths2process + assert out == 'Time to process /my/path\n' + + # check what we have in the database + Path = Query() + query = db.get(Path.input_path == '/my/path') + assert len(db) == 1 + assert query + assert query['success'] == success From 5052c087403fd24236dde56d0db0ec05339c6efb Mon Sep 17 00:00:00 2001 From: Matteo 
Visconti dOC Date: Sun, 5 Mar 2017 10:45:08 -0500 Subject: [PATCH 125/181] Rename test --- bin/test_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/test_monitor.py b/bin/test_monitor.py index f685e8b1..3984cb4c 100644 --- a/bin/test_monitor.py +++ b/bin/test_monitor.py @@ -50,7 +50,7 @@ def test_monitor(capsys): (None, 1), (CalledProcessError('mycmd', 1), 0) ]) -def test_process_success(tmpdir, capsys, side_effect, success): +def test_process(tmpdir, capsys, side_effect, success): db_fn = tmpdir.join('database.json') db = TinyDB(db_fn.strpath) paths2process = {'/my/path': 42} From 45c87a77182575d7e7f7fd8838a4337fd33ae1bb Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 5 Mar 2017 13:47:55 -0500 Subject: [PATCH 126/181] Heudiconv outputs info on what subject is being processed, gets captured by monitor to update database --- bin/heudiconv | 5 ++++- bin/monitor.py | 35 ++++++++++++++++++++++------------- bin/test_monitor.py | 23 ++++++++++++++++++----- 3 files changed, 44 insertions(+), 19 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 5496d52c..ceb5165e 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1705,7 +1705,8 @@ def _main(args): anon_outdir, anon_study_outdir, msg="Preparing for %s" % datalad_msg_suf, bids=args.bids) - + lgr.info("PROCESSING STARTS: {0}".format( + str(dict(subject=sid, outdir=study_outdir, session=session)))) convert_dicoms( sid, dicoms, @@ -1718,6 +1719,8 @@ def _main(args): ses=session, is_bids=args.bids, seqinfo=seqinfo) + lgr.info("PROCESSING DONE: {0}".format( + str(dict(subject=sid, outdir=study_outdir, session=session)))) if args.datalad: msg = "Converted subject %s" % datalad_msg_suf diff --git a/bin/monitor.py b/bin/monitor.py index c6e279d4..6a5c3c5b 100644 --- a/bin/monitor.py +++ b/bin/monitor.py @@ -27,6 +27,23 @@ _LOGGER.addHandler(ch) +def run_heudiconv(cmd): + info_dict = dict() + proc = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + return_code = proc.wait() + if return_code == 0: + _LOGGER.info("Done running {0}".format(cmd)) + info_dict['success'] = 1 + else: + _LOGGER.error("{0} failed".format(cmd)) + info_dict['success'] = 0 + # get info on what we run + match = re.match('INFO: PROCESSING STARTS: (.*)', proc.communicate()[0].decode('utf-8')) + info_dict_ = eval(match.group(1) if match else '') + info_dict.update(info_dict_) + return info_dict + + def process(paths2process, db, wait=WAIT_TIME): cmd = 'echo heudiconv {0}' #if paths2process and time.time() - os.path.getmtime(paths2process[0]) > WAIT_TIME: @@ -36,25 +53,17 @@ def process(paths2process, db, wait=WAIT_TIME): #process_me = paths2process.popleft().decode('utf-8') process_me = path cmd_ = cmd.format(process_me) - try: - print("Time to process {0}".format(process_me)) - subprocess.check_call(cmd_.split()) - _LOGGER.info("Done running {0}".format(cmd_)) - # here we should inspect output and then store additional info - db.insert({'input_path': process_me, 'success': 1, 'subject_id': 'TODO', 'output_path': 'TODO'}) - except subprocess.CalledProcessError: - _LOGGER.error("{0} failed".format(cmd_)) - db.insert({'input_path': process_me, 'success': 0}) + process_dict = {'input_path': process_me} + print("Time to process {0}".format(process_me)) + run_dict = run_heudiconv(cmd_) + process_dict.update(run_dict) + db.insert(process_dict) # if we processed it, or it failed, we need to remove it to avoid running it again processed.append(path) for processed_path in processed: del 
paths2process[processed_path] -def inspect_heudiconv_output(heudiconv_output): - pass - - def monitor(topdir='/tmp/new_dir', check_ptrn='/20../../..', db=None): #paths2process = deque() paths2process = OrderedDict() diff --git a/bin/test_monitor.py b/bin/test_monitor.py index 3984cb4c..b66b3293 100644 --- a/bin/test_monitor.py +++ b/bin/test_monitor.py @@ -1,7 +1,7 @@ from collections import namedtuple import pytest from mock import patch -from monitor import monitor, process, MASK_NEWDIR +from monitor import monitor, process, run_heudiconv, MASK_NEWDIR from os.path import exists from tinydb import TinyDB, Query from subprocess import CalledProcessError @@ -54,13 +54,18 @@ def test_process(tmpdir, capsys, side_effect, success): db_fn = tmpdir.join('database.json') db = TinyDB(db_fn.strpath) paths2process = {'/my/path': 42} - # test 1: everything ok - with patch('subprocess.check_call') as mocked_call: - mocked_call.side_effect = side_effect + with patch('subprocess.Popen') as mocked_popen: + mocked_popen_instance = mocked_popen.return_value + mocked_popen_instance.side_effect = side_effect + mocked_popen_instance.communicate.return_value = (b"INFO: PROCESSING STARTS: {'just': 'a test'}", ) + # set return value for wait + mocked_popen_instance.wait.return_value = 1 - success + # mock also communicate to get the supposed stdout + # mocked_popen.communicate = lambda: (b"INFO: PROCESSING STARTS: {'just': 'a test'}", ) process(paths2process, db, wait=-30) out, err = capsys.readouterr() - mocked_call.assert_called_once() + mocked_popen.assert_called_once() assert db_fn.check() # dictionary should be empty assert not paths2process @@ -72,3 +77,11 @@ def test_process(tmpdir, capsys, side_effect, success): assert len(db) == 1 assert query assert query['success'] == success + + +def test_run_heudiconv(): + # echo should succeed always + mydict = {'key1': 'value1', 'key2': 'value2', 'success': 1} + cmd = "echo INFO: PROCESSING STARTS: {0}".format(str(mydict)) + out = run_heudiconv(cmd) + assert out == mydict From f16e165b6d00827a340194ef1620ba67d2cd6287 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 5 Mar 2017 14:12:10 -0500 Subject: [PATCH 127/181] Add accession number to database --- bin/monitor.py | 2 +- bin/test_monitor.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/bin/monitor.py b/bin/monitor.py index 6a5c3c5b..870858a6 100644 --- a/bin/monitor.py +++ b/bin/monitor.py @@ -53,7 +53,7 @@ def process(paths2process, db, wait=WAIT_TIME): #process_me = paths2process.popleft().decode('utf-8') process_me = path cmd_ = cmd.format(process_me) - process_dict = {'input_path': process_me} + process_dict = {'input_path': process_me, 'accession_number': os.path.basename(process_me)} print("Time to process {0}".format(process_me)) run_dict = run_heudiconv(cmd_) process_dict.update(run_dict) diff --git a/bin/test_monitor.py b/bin/test_monitor.py index b66b3293..112f7027 100644 --- a/bin/test_monitor.py +++ b/bin/test_monitor.py @@ -1,4 +1,5 @@ from collections import namedtuple +import os import pytest from mock import patch from monitor import monitor, process, run_heudiconv, MASK_NEWDIR @@ -53,7 +54,8 @@ def test_monitor(capsys): def test_process(tmpdir, capsys, side_effect, success): db_fn = tmpdir.join('database.json') db = TinyDB(db_fn.strpath) - paths2process = {'/my/path': 42} + process_me = '/my/path/A12345' + paths2process = {process_me: 42} with patch('subprocess.Popen') as mocked_popen: mocked_popen_instance = mocked_popen.return_value 
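        # Descriptive note: patching 'subprocess.Popen' makes
        # mocked_popen.return_value the `proc` object that run_heudiconv()
        # works with, so pre-setting wait() and communicate() below fakes both
        # a successful run and a failing one without spawning a real process.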
mocked_popen_instance.side_effect = side_effect @@ -69,14 +71,16 @@ def test_process(tmpdir, capsys, side_effect, success): assert db_fn.check() # dictionary should be empty assert not paths2process - assert out == 'Time to process /my/path\n' + assert out == 'Time to process {0}\n'.format(process_me) # check what we have in the database Path = Query() - query = db.get(Path.input_path == '/my/path') + query = db.get(Path.input_path == process_me) assert len(db) == 1 assert query assert query['success'] == success + assert query['accession_number'] == os.path.basename(process_me) + assert query['just'] == 'a test' def test_run_heudiconv(): From 071d863f90508c431c670440a1de086912924fb8 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 5 Mar 2017 15:20:07 -0500 Subject: [PATCH 128/181] Add logdir option --- bin/monitor.py | 23 +++++++++++++++-------- bin/test_monitor.py | 17 ++++++++++++----- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/bin/monitor.py b/bin/monitor.py index 870858a6..309d9975 100644 --- a/bin/monitor.py +++ b/bin/monitor.py @@ -9,6 +9,7 @@ from datetime import date import inotify.adapters from inotify.constants import IN_MODIFY, IN_CREATE, IN_ISDIR +from py.path import local as localpath from tinydb import TinyDB _DEFAULT_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' @@ -16,7 +17,7 @@ MASK = (IN_MODIFY | IN_CREATE) MASK_NEWDIR = (IN_CREATE | IN_ISDIR) -WAIT_TIME = 10 # in seconds +WAIT_TIME = 86400 # in seconds #def _configure_logging(): @@ -38,13 +39,14 @@ def run_heudiconv(cmd): _LOGGER.error("{0} failed".format(cmd)) info_dict['success'] = 0 # get info on what we run - match = re.match('INFO: PROCESSING STARTS: (.*)', proc.communicate()[0].decode('utf-8')) + stdout = proc.communicate()[0].decode('utf-8') + match = re.match('INFO: PROCESSING STARTS: (.*)', stdout) info_dict_ = eval(match.group(1) if match else '') info_dict.update(info_dict_) - return info_dict + return stdout, info_dict -def process(paths2process, db, wait=WAIT_TIME): +def process(paths2process, db, wait=WAIT_TIME, logdir='log'): cmd = 'echo heudiconv {0}' #if paths2process and time.time() - os.path.getmtime(paths2process[0]) > WAIT_TIME: processed = [] @@ -55,16 +57,20 @@ def process(paths2process, db, wait=WAIT_TIME): cmd_ = cmd.format(process_me) process_dict = {'input_path': process_me, 'accession_number': os.path.basename(process_me)} print("Time to process {0}".format(process_me)) - run_dict = run_heudiconv(cmd_) + stdout, run_dict = run_heudiconv(cmd_) process_dict.update(run_dict) db.insert(process_dict) + # save log + logdir = localpath(logdir) + log = logdir.join(process_dict['accession_number'] + '.log') + log.write(stdout) # if we processed it, or it failed, we need to remove it to avoid running it again processed.append(path) for processed_path in processed: del paths2process[processed_path] -def monitor(topdir='/tmp/new_dir', check_ptrn='/20../../..', db=None): +def monitor(topdir='/tmp/new_dir', check_ptrn='/20../../..', db=None, wait=WAIT_TIME): #paths2process = deque() paths2process = OrderedDict() # watch only today's folder @@ -91,7 +97,7 @@ def monitor(topdir='/tmp/new_dir', check_ptrn='/20../../..', db=None): print("Updating {0}: {1}".format(path, paths2process[path])) # check if there's anything to process - process(paths2process, db) + process(paths2process, db, wait=wait) def parse_args(): @@ -100,6 +106,7 @@ def parse_args(): parser.add_argument('path', help='Which directory to monitor') parser.add_argument('--check_ptrn', '-p', 
help='regexp pattern for which subdirectories to check', default='/20../../..') parser.add_argument('--database', '-d', help='database location', default='database.json') + parser.add_argument('--wait_time', '-w', help='After how long should we start processing datasets? (in seconds)', default=86400) return parser.parse_args() @@ -109,4 +116,4 @@ def parse_args(): parsed = parse_args() # open database db = TinyDB(parsed.database, default_table='heudiconv') - monitor(parsed.path, parsed.check_ptrn, db) + monitor(parsed.path, parsed.check_ptrn, db, wait=parsed.wait_time) diff --git a/bin/test_monitor.py b/bin/test_monitor.py index 112f7027..78c0285d 100644 --- a/bin/test_monitor.py +++ b/bin/test_monitor.py @@ -53,21 +53,27 @@ def test_monitor(capsys): ]) def test_process(tmpdir, capsys, side_effect, success): db_fn = tmpdir.join('database.json') + log_dir = tmpdir.mkdir('log') db = TinyDB(db_fn.strpath) process_me = '/my/path/A12345' + accession_number = os.path.basename(process_me) paths2process = {process_me: 42} with patch('subprocess.Popen') as mocked_popen: + stdout = b"INFO: PROCESSING STARTS: {'just': 'a test'}" mocked_popen_instance = mocked_popen.return_value mocked_popen_instance.side_effect = side_effect - mocked_popen_instance.communicate.return_value = (b"INFO: PROCESSING STARTS: {'just': 'a test'}", ) + mocked_popen_instance.communicate.return_value = (stdout, ) # set return value for wait mocked_popen_instance.wait.return_value = 1 - success # mock also communicate to get the supposed stdout # mocked_popen.communicate = lambda: (b"INFO: PROCESSING STARTS: {'just': 'a test'}", ) - process(paths2process, db, wait=-30) + process(paths2process, db, wait=-30, logdir=log_dir.strpath) out, err = capsys.readouterr() - + log_fn = log_dir.join(accession_number + '.log') + mocked_popen.assert_called_once() + assert log_fn.check() + assert log_fn.read() == stdout.decode('utf-8') assert db_fn.check() # dictionary should be empty assert not paths2process @@ -87,5 +93,6 @@ def test_run_heudiconv(): # echo should succeed always mydict = {'key1': 'value1', 'key2': 'value2', 'success': 1} cmd = "echo INFO: PROCESSING STARTS: {0}".format(str(mydict)) - out = run_heudiconv(cmd) - assert out == mydict + stdout, info_dict = run_heudiconv(cmd) + assert info_dict == mydict + assert "echo " + stdout.strip() == cmd From e18fefd7b441c7a6ca08a050a731c605fd4968fe Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 5 Mar 2017 15:26:10 -0500 Subject: [PATCH 129/181] Rename monitor file, make logdir --- bin/{monitor.py => heudiconv_monitor} | 13 ++++++++++--- bin/{test_monitor.py => test_heudiconv_monitor.py} | 0 2 files changed, 10 insertions(+), 3 deletions(-) rename bin/{monitor.py => heudiconv_monitor} (93%) rename bin/{test_monitor.py => test_heudiconv_monitor.py} (100%) diff --git a/bin/monitor.py b/bin/heudiconv_monitor similarity index 93% rename from bin/monitor.py rename to bin/heudiconv_monitor index 309d9975..79472b64 100644 --- a/bin/monitor.py +++ b/bin/heudiconv_monitor @@ -70,7 +70,12 @@ def process(paths2process, db, wait=WAIT_TIME, logdir='log'): del paths2process[processed_path] -def monitor(topdir='/tmp/new_dir', check_ptrn='/20../../..', db=None, wait=WAIT_TIME): +def monitor(topdir='/tmp/new_dir', check_ptrn='/20../../..', db=None, wait=WAIT_TIME, logdir='log'): + # make logdir if not existant + try: + os.makedirs(parsed.logdir) + except OSError: + pass #paths2process = deque() paths2process = OrderedDict() # watch only today's folder @@ -97,7 +102,7 @@ def 
monitor(topdir='/tmp/new_dir', check_ptrn='/20../../..', db=None, wait=WAIT_ print("Updating {0}: {1}".format(path, paths2process[path])) # check if there's anything to process - process(paths2process, db, wait=wait) + process(paths2process, db, wait=wait, logdir=logdir) def parse_args(): @@ -107,6 +112,8 @@ def parse_args(): parser.add_argument('--check_ptrn', '-p', help='regexp pattern for which subdirectories to check', default='/20../../..') parser.add_argument('--database', '-d', help='database location', default='database.json') parser.add_argument('--wait_time', '-w', help='After how long should we start processing datasets? (in seconds)', default=86400) + parser.add_argument('--logdir', '-l', help='Where should we save the logs?', default='log') + return parser.parse_args() @@ -116,4 +123,4 @@ def parse_args(): parsed = parse_args() # open database db = TinyDB(parsed.database, default_table='heudiconv') - monitor(parsed.path, parsed.check_ptrn, db, wait=parsed.wait_time) + monitor(parsed.path, parsed.check_ptrn, db, wait=parsed.wait_time, logdir=parsed.logdir) diff --git a/bin/test_monitor.py b/bin/test_heudiconv_monitor.py similarity index 100% rename from bin/test_monitor.py rename to bin/test_heudiconv_monitor.py From cd658fc768967565d0e4bc0527f91af8f6832543 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 5 Mar 2017 15:26:40 -0500 Subject: [PATCH 130/181] Make heudiconv_monitor executable --- bin/heudiconv_monitor | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 bin/heudiconv_monitor diff --git a/bin/heudiconv_monitor b/bin/heudiconv_monitor old mode 100644 new mode 100755 From 410103b87934a5398e560ca773ca47e3246b8b47 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 5 Mar 2017 15:27:22 -0500 Subject: [PATCH 131/181] Automatically configure logging --- bin/heudiconv_monitor | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/heudiconv_monitor b/bin/heudiconv_monitor index 79472b64..36445b6e 100755 --- a/bin/heudiconv_monitor +++ b/bin/heudiconv_monitor @@ -119,7 +119,6 @@ def parse_args(): if __name__ == '__main__': - _configure_logging() parsed = parse_args() # open database db = TinyDB(parsed.database, default_table='heudiconv') From 71102e7407210d0c7a94040e03224a6165784db9 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 5 Mar 2017 15:42:01 -0500 Subject: [PATCH 132/181] Use json instead of eval --- bin/heudiconv_monitor | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/heudiconv_monitor b/bin/heudiconv_monitor index 36445b6e..8a52f7d2 100755 --- a/bin/heudiconv_monitor +++ b/bin/heudiconv_monitor @@ -4,6 +4,7 @@ import os import re import subprocess import time +import json from collections import deque, OrderedDict from datetime import date @@ -41,7 +42,7 @@ def run_heudiconv(cmd): # get info on what we run stdout = proc.communicate()[0].decode('utf-8') match = re.match('INFO: PROCESSING STARTS: (.*)', stdout) - info_dict_ = eval(match.group(1) if match else '') + info_dict_ = json.loads(match.group(1) if match else '{}') info_dict.update(info_dict_) return stdout, info_dict From 195e9936e816fa1cee59498de37da039c78e1b86 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sun, 5 Mar 2017 16:27:05 -0500 Subject: [PATCH 133/181] ls instead of echoing to debug --- bin/heudiconv_monitor | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/heudiconv_monitor b/bin/heudiconv_monitor index 8a52f7d2..2faec825 100755 --- a/bin/heudiconv_monitor +++ 
b/bin/heudiconv_monitor @@ -48,7 +48,7 @@ def run_heudiconv(cmd): def process(paths2process, db, wait=WAIT_TIME, logdir='log'): - cmd = 'echo heudiconv {0}' + cmd = 'ls -l {0}' #if paths2process and time.time() - os.path.getmtime(paths2process[0]) > WAIT_TIME: processed = [] for path, mod_time in paths2process.items(): @@ -112,7 +112,7 @@ def parse_args(): parser.add_argument('path', help='Which directory to monitor') parser.add_argument('--check_ptrn', '-p', help='regexp pattern for which subdirectories to check', default='/20../../..') parser.add_argument('--database', '-d', help='database location', default='database.json') - parser.add_argument('--wait_time', '-w', help='After how long should we start processing datasets? (in seconds)', default=86400) + parser.add_argument('--wait_time', '-w', help='After how long should we start processing datasets? (in seconds)', default=86400, type=float) parser.add_argument('--logdir', '-l', help='Where should we save the logs?', default='log') @@ -121,6 +121,7 @@ def parse_args(): if __name__ == '__main__': parsed = parse_args() + print('Got {0}'.format(parsed)) # open database db = TinyDB(parsed.database, default_table='heudiconv') monitor(parsed.path, parsed.check_ptrn, db, wait=parsed.wait_time, logdir=parsed.logdir) From d5f54a3ee094bdb6f1b8d342a3bf129965233649 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 7 Mar 2017 09:43:22 -0500 Subject: [PATCH 134/181] add dcmtk for dcmdump --- custom/dbic/singularity-env.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom/dbic/singularity-env.def b/custom/dbic/singularity-env.def index fc83a1f5..a7722e4a 100644 --- a/custom/dbic/singularity-env.def +++ b/custom/dbic/singularity-env.def @@ -42,7 +42,7 @@ MirrorURL: http://ftp.us.debian.org/debian/ wget -q -O/tmp/nd-configurerepo https://raw.githubusercontent.com/neurodebian/neurodebian/4d26c8f30433145009aa3f74516da12f560a5a13/tools/nd-configurerepo bash /tmp/nd-configurerepo chmod a+r -R /etc/apt - eatmydata apt-get -y install datalad python-nipype virtualenv dcm2niix python-dcmstack python-configparser python-funcsigs python-pytest + eatmydata apt-get -y install datalad python-nipype virtualenv dcm2niix python-dcmstack python-configparser python-funcsigs python-pytest dcmtk # for bids-validator curl -sL https://deb.nodesource.com/setup_6.x | bash - && \ From c3d27a3c5c1d5a6f9b79b7c1dc37320926bc09e0 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 9 Mar 2017 12:29:01 -0500 Subject: [PATCH 135/181] BF+TST: in case of a provided template, do not output groupped dictionary --- bin/heudiconv | 9 ++++++--- tests/test_heuristics.py | 27 +++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 5496d52c..4dd9e4d6 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -296,7 +296,7 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, grouping='stud dcmfilter : callable, optional If called on dcm_data and returns True, it is used to set series_id - grouping : str ('studyUID', 'accession_number') + grouping : str ('studyUID', 'accession_number') or None, optional what to group by: studyUID or accession_number Returns @@ -307,7 +307,7 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, grouping='stud filegrp : dict `filegrp` is a dictionary with files groupped per each sequence """ - allowed_groupings = ['studyUID', 'accession_number'] + allowed_groupings = ['studyUID', 'accession_number', None] if grouping not in 
allowed_groupings: raise ValueError('I do not know how to group by {0}'.format(grouping)) per_studyUID = grouping == 'studyUID' @@ -1114,8 +1114,11 @@ def convert_dicoms(sid, seqinfo = group_dicoms_into_seqinfos( dicoms, flfilter=getattr(heuristic, 'filter_files', None), - dcmfilter=getattr(heuristic, 'filter_dicom', None)) + dcmfilter=getattr(heuristic, 'filter_dicom', None), + grouping=None, # no groupping + ) seqinfo_list = list(seqinfo.keys()) + filegroup = {si.series_id: x for si, x in seqinfo.items()} save_json(filegroup_file, filegroup) diff --git a/tests/test_heuristics.py b/tests/test_heuristics.py index 67cb4c7c..212b3c88 100644 --- a/tests/test_heuristics.py +++ b/tests/test_heuristics.py @@ -16,9 +16,23 @@ def test_smoke_converall(tmpdir): ) -def test_dbic_bids_largely_smoke(tmpdir): - args = ("-f heuristics/dbic_bids.py -c dcm2niix -o %s -b " - "--datalad tests/data" % tmpdir).split(' '); +@pytest.mark.parametrize('heuristic', [ 'dbic_bids', 'convertall' ]) +@pytest.mark.parametrize( + 'invocation', [ + "tests/data", # our new way with automated groupping + "-d tests/data/%s/* -s 01-fmap_acq-3mm" # "old" way specifying subject + # should produce the same results + ]) +def test_dbic_bids_largely_smoke(tmpdir, heuristic, invocation): + args = ( + ("-f heuristics/%s.py -c dcm2niix -o %s -b --datalad " % (heuristic, tmpdir)) + + invocation + ).split(' ') + if heuristic != 'dbic_bids' and invocation == 'tests/data': + # none other heuristic has mighty infotoids atm + with pytest.raises(NotImplementedError): + heudiconv.main(args) + return heudiconv.main(args) ds = Dataset(str(tmpdir)) assert ds.is_installed() @@ -26,8 +40,13 @@ def test_dbic_bids_largely_smoke(tmpdir): head = ds.repo.get_hexsha() # and if we rerun -- should fail - with pytest.raises(RuntimeError): + if heuristic != 'dbic_bids' and invocation != 'tests/data': + # those guys -- they just plow through it ATM without failing, i.e. 
+ # the logic is to reprocess heudiconv.main(args) + else: + with pytest.raises(RuntimeError): + heudiconv.main(args) # but there should be nothing new assert not ds.repo.dirty assert head == ds.repo.get_hexsha() From 79f1a0d2434ef219835ce0306e69b3c92e93f281 Mon Sep 17 00:00:00 2001 From: DBIC BIDS Team Date: Tue, 28 Mar 2017 15:06:08 -0500 Subject: [PATCH 136/181] BF: added "read bit" to make chmodded files readable --- bin/heudiconv | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 83f57c64..72afe291 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -913,7 +913,7 @@ def convert(items, symlink=True, converter=None, if exists(scaninfo): lgr.info("Post-treating %s file", scaninfo) treat_infofile(scaninfo) - os.chmod(outname, 0o0440) + os.chmod(outname, 0o0444) if custom_callable is not None: custom_callable(*item) @@ -949,9 +949,9 @@ def tuneup_bids_json_files(json_files): json.load(open(json_basename + '_magnitude%d.json' % i))[ 'EchoTime'] # might have been made R/O already - os.chmod(json_phasediffname, 0o0660) + os.chmod(json_phasediffname, 0o0664) json.dump(json_, open(json_phasediffname, 'w'), indent=2) - os.chmod(json_phasediffname, 0o0440) + os.chmod(json_phasediffname, 0o0444) # phasediff one should contain two PhaseDiff's # -- one for original amplitude and the other already replicating what is there @@ -1022,15 +1022,15 @@ in that one though if global_options['overwrite'] and lexists(scaninfo): # TODO: handle annexed file case if not os.path.islink(scaninfo): - os.chmod(scaninfo, 0o0660) + os.chmod(scaninfo, 0o0664) res = embedfunc.run() - os.chmod(scaninfo, 0o0440) + os.chmod(scaninfo, 0o0444) if with_prov: g = res.provenance.rdf() g.parse(prov_file, format='turtle') g.serialize(prov_file, format='turtle') - os.chmod(prov_file, 0o0440) + os.chmod(prov_file, 0o0444) except Exception as exc: lgr.error("Embedding failed: %s", str(exc)) os.chdir(cwd) @@ -1047,10 +1047,10 @@ def treat_infofile(filename): j_slim = slim_down_info(j) j_pretty = json_dumps_pretty(j_slim, indent=2, sort_keys=True) - os.chmod(filename, 0o0660) + os.chmod(filename, 0o0664) with open(filename, 'wt') as fp: fp.write(j_pretty) - os.chmod(filename, 0o0440) + os.chmod(filename, 0o0444) def convert_dicoms(sid, From df6071316baf444b1900740a66d29ea874c923d8 Mon Sep 17 00:00:00 2001 From: DBIC BIDS Team Date: Mon, 3 Apr 2017 14:43:58 -0500 Subject: [PATCH 137/181] fixups for one of the studies --- heuristics/dbic_bids.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 8b2d1cb1..e59852cd 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -77,7 +77,14 @@ '76b36c80231b0afaf509e2d52046e964': [('fmap_run\+_2mm', 'fmap_run+_acq-2mm')], 'c6d8fbccc72990bee61d28e73b2618a4': - [('run=', 'run+')] + [('run=', 'run+')], + 'a751cc977f1e354fcafcb0ea2de123bd': + [ + ('_unlabeled', '_task-unlabeled'), + ('_mSense', '_acq-mSense'), + ('_p1_sms4_2.5mm', '_acq-p1-sms4-2.5mm'), + ('_p1_sms4_3mm', '_acq-p1-sms4-3mm'), + ], } keys2replace = ['protocol_name', 'series_description'] From f910e4a9176f1f07987fee633bf29afc4c0cc131 Mon Sep 17 00:00:00 2001 From: DBIC BIDS Team Date: Tue, 4 Apr 2017 09:27:05 -0500 Subject: [PATCH 138/181] ENH: few more skipped runs for a study and run0 -> run-0 for another --- heuristics/dbic_bids.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index e59852cd..3c045de0 100644 --- 
a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -22,6 +22,11 @@ 'A000130': ['^15-'], 'A000137': ['^9-', '^11-'], 'A000297': ['^12-'], + 'A000376': ['^15-'], + 'A000384': ['^8-', '^11-'], + 'A000467': ['^15-'], + 'A000490': ['^15-'], + 'A000511': ['^15-'], } # dictionary containing fixes, keys are md5sum of study_description from @@ -85,6 +90,10 @@ ('_p1_sms4_2.5mm', '_acq-p1-sms4-2.5mm'), ('_p1_sms4_3mm', '_acq-p1-sms4-3mm'), ], + 'd160113cf5ea8c5d0cbbbe14ef625e76': + [ + ('_run0', '_run-0'), + ], } keys2replace = ['protocol_name', 'series_description'] From 87354ee8897171aab3a72e090b095cffe593615c Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 4 Apr 2017 12:42:33 -0400 Subject: [PATCH 139/181] ENH: support _ses-{date} --- heuristics/dbic_bids.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 3c045de0..97e84f4f 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -500,10 +500,17 @@ def infotoids(seqinfos, outdir): # and possible ses+ in the sequence names, so we would provide a sequence # So might need to go through parse_dbic_protocol_name(s.protocol_name) # to figure out presence of sessions. - ses_markers = [ - parse_dbic_protocol_name(s.protocol_name).get('session', None) for s in seqinfos - if not s.is_derived - ] + ses_markers = [] + for s in seqinfos: + if s.is_derived: + continue + session_ = parse_dbic_protocol_name(s.protocol_name).get('session', None) + if session_ and '{' in session_: + # there was a marker for something we could provide from our seqinfo + # e.g. {date} + session_ = session_.format(**s._asdict()) + ses_markers.append(session_) + ses_markers = filter(bool, ses_markers) # only present ones session = None if ses_markers: From 366556a1891dcff0f51c309988162f5a7b946527 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 4 Apr 2017 16:23:30 -0400 Subject: [PATCH 140/181] ENH: decide on having motion correction done also using ImageType and add _rec-moco suffix --- bin/heudiconv | 7 +++++-- heuristics/dbic_bids.py | 23 ++++++++++++++++------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 72afe291..7b0362e2 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -455,6 +455,9 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, grouping='stud except AttributeError: TE = -1 + image_type = tuple(dcminfo.ImageType) + motion_corrected = 'MoCo' in dcminfo.SeriesDescription \ + or 'MOCO' in image_type info = SeqInfo( total, os.path.split(files[0])[1], @@ -463,14 +466,14 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, grouping='stud size[0], size[1], size[2], size[3], TR, TE, dcminfo.ProtocolName, - 'MoCo' in dcminfo.SeriesDescription, + motion_corrected, # New ones by us 'derived' in [x.lower() for x in dcminfo.get('ImageType', [])], dcminfo.get('PatientID'), dcminfo.get('StudyDescription'), dcminfo.get('ReferringPhysicianName'), dcminfo.get('SeriesDescription'), - tuple(dcminfo.ImageType), + image_type, accession_number, # For demographics to populate BIDS participants.tsv dcminfo.get('PatientsAge'), diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 97e84f4f..6a1ff1fd 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -38,13 +38,16 @@ [ #('anat-scout.*', 'anat-scout_ses-{date}'), ('anat-scout.*', 'anat-scout'), - ('BOLD_p2_s4(_3\.5mm)?', 'func_run+_task-rest_acq-p2s4'), - ('BOLD_p2_noprescannormalize', 
'func_run+_task-rest_acq-p2noprescannormalize'), - ('BOLD_p2', 'func_run+_task-rest_acq-p2'), - ('BOLD_', 'func_run+_task-rest'), - ('DTI_30_p2_s4(_3\.5mm)?', 'dwi_run+_acq-30p2s4'), - ('DTI_30_p2', 'dwi_run+_acq-30p2'), - ('_p2_s4(_3\.5mm)?', '_acq-p2s4'), + ('BOLD_p2_s4_3\.5mm', 'func_task-rest_acq-p2-s4-3.5mm'), + ('BOLD_p2_s4', 'func_task-rest_acq-p2-s4'), + ('BOLD_p2_noprescannormalize', 'func-bold_task-rest_acq-p2noprescannormalize'), + ('BOLD_p2', 'func-bold_task-rest_acq-p2'), + ('BOLD_', 'func_task-rest'), + ('DTI_30_p2_s4_3\.5mm', 'dwi_acq-DTI-30-p2-s4-3.5mm'), + ('DTI_30_p2_s4', 'dwi_acq-DTI-30-p2-s4'), + ('DTI_30_p2', 'dwi_acq-DTI-30-p2'), + ('_p2_s4_3\.5mm', '_acq-p2-s4-3.5mm'), + ('_p2_s4', '_acq-p2-s4'), ('_p2', '_acq-p2'), ], '9d148e2a05f782273f6343507733309d': @@ -387,9 +390,15 @@ def infotodict(seqinfo): # if there is no _run -- no run label addded run_label = None + if s.is_motion_corrected and 'rec-' in regd.get('bids', ''): + raise NotImplementedError("want to add _acq-moco but there is _acq- already") + suffix_parts = [ None if not regd.get('task') else "task-%s" % regd['task'], None if not regd.get('acq') else "acq-%s" % regd['acq'], + # But we want to add an indicator in case it was motion corrected + # in the magnet. ref sample /2017/01/03/qa + None if not s.is_motion_corrected else 'rec-moco', regd.get('bids'), run_label, seqtype_label, From 2d68d1cd33c5d15fe66a4f4736c9f3d3a6e71267 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 6 Apr 2017 13:24:25 -0400 Subject: [PATCH 141/181] BF+ENH: fix seqinfo entries twice -- once first when figuring out session. allow for _ses+ to increment over previously existing outputs --- bin/heudiconv | 3 +- heuristics/dbic_bids.py | 85 +++++++++++++++++++++++++++++------------ 2 files changed, 63 insertions(+), 25 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 7b0362e2..5c040d9b 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1022,7 +1022,7 @@ and there was a screw up for that A in that one though """ - if global_options['overwrite'] and lexists(scaninfo): + if global_options['overwrite'] and os.path.lexists(scaninfo): # TODO: handle annexed file case if not os.path.islink(scaninfo): os.chmod(scaninfo, 0o0664) @@ -1087,6 +1087,7 @@ def convert_dicoms(sid, # Figure out where to stick supplemental info dicoms idir = os.path.join(outdir, '.heudiconv', sid) + # THAT IS WHERE WE MUST KNOW ABOUT SESSION ALREADY! 
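# The session used here is decided by the dbic_bids heuristic changed in this
# same patch.  A simplified sketch (not the verbatim implementation) of how its
# 'ses+' / 'ses=' protocol markers map to a session label, given whatever
# ses-* directories already exist for the subject:
def deduce_session(ses_markers, prior_sessions):
    """ses_markers: e.g. ['+'] or ['=']; prior_sessions: sorted 'ses-*' basenames."""
    if ses_markers == ['+']:
        return '%03d' % (len(prior_sessions) + 1)   # increment past what is on disk
    elif ses_markers == ['=']:
        # keep the latest existing session, or start at 001 if there is none yet
        return prior_sessions[-1][4:] if prior_sessions else '001'
    return '001'

assert deduce_session(['+'], ['ses-001', 'ses-002']) == '003'
assert deduce_session(['='], ['ses-001', 'ses-002']) == '002'
assert deduce_session(['='], []) == '001'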
if is_bids and ses: idir = os.path.join(idir, 'ses-%s' % str(ses)) # yoh: in my case if idir exists, it means that that study/subject/session diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 6a1ff1fd..41741504 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -3,6 +3,8 @@ from collections import OrderedDict import hashlib +from glob import glob + import logging lgr = logging.getLogger('heudiconv') @@ -92,11 +94,16 @@ ('_mSense', '_acq-mSense'), ('_p1_sms4_2.5mm', '_acq-p1-sms4-2.5mm'), ('_p1_sms4_3mm', '_acq-p1-sms4-3mm'), - ], + ], 'd160113cf5ea8c5d0cbbbe14ef625e76': [ ('_run0', '_run-0'), - ], + ], + '1bd62e10672fe0b435a9aa8d75b45425': + [ + # need to add incrementing session -- study should have 2 + ('scout_run\+$', 'scout_run+_ses+'), + ], } keys2replace = ['protocol_name', 'series_description'] @@ -181,8 +188,15 @@ def md5sum(string): return m.hexdigest() +def get_study_hash(seqinfo): + # XXX: ad hoc hack + study_description = get_unique(seqinfo, 'study_description') + return md5sum(study_description) + + def fix_canceled_runs(seqinfo, accession2run=fix_accession2run): - """Function that adds cancelme_ to known bad runs which were forgotten""" + """Function that adds cancelme_ to known bad runs which were forgotten + """ accession_number = get_unique(seqinfo, 'accession_number') if accession_number in accession2run: lgr.info("Considering some runs possibly marked to be " @@ -200,15 +214,15 @@ def fix_canceled_runs(seqinfo, accession2run=fix_accession2run): def fix_dbic_protocol(seqinfo, keys=keys2replace, subsdict=protocols2fix): - """Ad-hoc fixup for existing protocols""" - # get name of the study to check if we know how to fix it up - study_descr = get_unique(seqinfo, 'study_description') - study_descr_hash = md5sum(study_descr) + """Ad-hoc fixup for existing protocols + """ + study_hash = get_study_hash(seqinfo) + + if study_hash not in subsdict: + raise ValueError("I don't know how to fix {0}".format(study_hash)) - if study_descr_hash not in subsdict: - raise ValueError("I don't know how to fix {0}".format(study_descr)) # need to replace both protocol_name series_description - substitutions = subsdict[study_descr_hash] + substitutions = subsdict[study_hash] for i, s in enumerate(seqinfo): fixed_kwargs = dict() for key in keys: @@ -223,13 +237,24 @@ def fix_dbic_protocol(seqinfo, keys=keys2replace, subsdict=protocols2fix): return seqinfo +def fix_seqinfo(seqinfo): + """Just a helper on top of both fixers + """ + # add cancelme to known bad runs + seqinfo = fix_canceled_runs(seqinfo) + study_hash = get_study_hash(seqinfo) + if study_hash in protocols2fix: + lgr.info("Fixing up protocol for {0}".format(study_hash)) + seqinfo = fix_dbic_protocol(seqinfo) + return seqinfo + + def ls(study_session, seqinfo): """Additional ls output for a seqinfo""" #assert len(sequences) <= 1 # expecting only a single study here #seqinfo = sequences.keys()[0] - study_descr = get_unique(seqinfo, 'study_description') - study_descr_hash = md5sum(study_descr) - return ' study hash: %s' % study_descr_hash + return ' study hash: %s' % get_study_hash(seqinfo) + # XXX we killed session indicator! what should we do now?!!! 
# WE DON:T NEED IT -- it will be provided into conversion_info as `session` @@ -245,16 +270,8 @@ def infotodict(seqinfo): subindex: sub index within group session: scan index for longitudinal acq """ - # XXX: ad hoc hack - study_description = get_unique(seqinfo, 'study_description') - - # add cancelme to known bad runs - seqinfo = fix_canceled_runs(seqinfo) - - if md5sum(study_description) in protocols2fix: - lgr.info("Fixing up protocol for {0}".format(study_description)) - seqinfo = fix_dbic_protocol(seqinfo) + seqinfo = fix_seqinfo(seqinfo) lgr.info("Processing %d seqinfo entries", len(seqinfo)) and_dicom = ('dicom', 'nii.gz') @@ -510,6 +527,11 @@ def infotoids(seqinfos, outdir): # So might need to go through parse_dbic_protocol_name(s.protocol_name) # to figure out presence of sessions. ses_markers = [] + + # there might be fixups needed so we could deduce session etc + # this copy is not replacing original one, so the same fix_seqinfo + # might be called later + seqinfos = fix_seqinfo(seqinfos) for s in seqinfos: if s.is_derived: continue @@ -548,8 +570,23 @@ def infotoids(seqinfos, outdir): # out initial one if sign ones, and should make use of knowing # outdir #raise NotImplementedError() - # Let's be lazy for now just to get somewhere - session = '001' + # we need to look at what sessions we already have + sessions_dir = os.path.join(outdir, locator, 'sub-' + subject) + prior_sessions = sorted(glob(os.path.join(sessions_dir, 'ses-*'))) + # TODO: more complicated logic + # For now just increment session if + and keep the same number if = + # and otherwise just give it 001 + # Note: this disables our safety blanket which would refuse to process + # what was already processed before since it would try to override, + # BUT there is no other way besides only if heudiconv was storing + # its info based on some UID + if ses_markers == ['+']: + session = '%03d' % (len(prior_sessions) + 1) + elif ses_markers == ['=']: + session = os.path.basename(prior_sessions[-1])[4:] if prior_sessions else '001' + else: + session = '001' + if study_description_hash == '9d148e2a05f782273f6343507733309d': session = 'siemens1' From f0c43e5e7fb42a5f350ecddc42c3060c2b986ecb Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 6 Apr 2017 13:52:39 -0400 Subject: [PATCH 142/181] RF/BF: to use datalad >= 0.5.1 where we need to add stuff explicitly --- bin/heudiconv | 24 +++++++++++++++++------- tests/test_main.py | 4 +++- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 5c040d9b..e665c2d9 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1487,6 +1487,9 @@ def add_to_datalad(topdir, studydir, msg=None, bids=False): from datalad.api import Dataset from datalad.support.annexrepo import AnnexRepo + from datalad.support.external_versions import external_versions + assert external_versions['datalad'] >= '0.5.1', "Need datalad >= 0.5.1" + studyrelpath = os.path.relpath(studydir, topdir) assert not studyrelpath.startswith(os.path.pardir) # so we are under # now we need to test and initiate a DataLad dataset all along the path @@ -1501,7 +1504,6 @@ def add_to_datalad(topdir, studydir, msg=None, bids=False): # would require annex > 20161018 for correct operation on annex v6 ds_ = create(curdir_, dataset=superds, force=True, - if_dirty='ignore', # see https://github.com/datalad/datalad/issues/1016 no_annex=True, # need to add .gitattributes first anyways shared_access='all', annex_version=6) @@ -1524,12 +1526,20 @@ def add_to_datalad(topdir, studydir, msg=None, 
bids=False): ds.config.add('annex.thin', 'true', where='local') # initialize annex there if not yet initialized AnnexRepo(ds.path, init=True) - # Let's make it a - dstop = Dataset(topdir) - # ideally we should save from within the subdataset, - # but https://github.com/datalad/datalad/pull/987 is not yet there - # so for now saving everything - dstop.save(message=msg, auto_add_changes=True, recursive=True) + # ds might have memories of having ds.repo GitRepo + superds = None + del ds + ds = Dataset(studydir) + # ha -- TODO: we actually want 'auto_add_files' feature in datalad ;) + add_files = filter(os.path.exists, + glob(os.path.join(ds.path, '*')) + + [os.path.join(ds.path, f) + for f in ('.datalad', '.heudiconv', '.gitattributes') + ]) + if add_files: + ds.add(add_files, recursive=True, save=False) + ds.save(message=msg, recursive=True, super_datasets=True) + assert not ds.repo.dirty # TODO: they are still appearing as native annex symlinked beasts """ diff --git a/tests/test_main.py b/tests/test_main.py index 82da08a4..089b6823 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -87,7 +87,9 @@ def test_prepare_for_datalad(tmpdir): # the last one should have been the study target_files = { - '.gitattributes', '.datalad/config', 'dataset_description.json', + '.gitattributes', + '.datalad/config', '.datalad/.gitattributes', + 'dataset_description.json', 'CHANGES', 'README'} assert set(ds.repo.get_indexed_files()) == target_files # and all are under git From 9b8600d4b48884659f2e21aeceb195ffb5eb907b Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 6 Apr 2017 14:13:24 -0400 Subject: [PATCH 143/181] ENH: simplify a bit adding files --- bin/heudiconv | 17 +++++++++-------- tests/test_main.py | 1 + 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index e665c2d9..759631a6 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1530,14 +1530,15 @@ def add_to_datalad(topdir, studydir, msg=None, bids=False): superds = None del ds ds = Dataset(studydir) - # ha -- TODO: we actually want 'auto_add_files' feature in datalad ;) - add_files = filter(os.path.exists, - glob(os.path.join(ds.path, '*')) - + [os.path.join(ds.path, f) - for f in ('.datalad', '.heudiconv', '.gitattributes') - ]) - if add_files: - ds.add(add_files, recursive=True, save=False) + # Add doesn't have all the options of save such as msg and supers + ds.add('.gitattributes', to_git=True, save=False) + if os.path.lexists(os.path.join(ds.path, '.heudiconv')): + ds.add('.heudiconv', to_git=True, save=False) + ds.add('.', recursive=True, save=False, + # not in effect! ? 
+ #annex_add_opts=['--include-dotfiles'] + ) + ds.save(message=msg, recursive=True, super_datasets=True) assert not ds.repo.dirty diff --git a/tests/test_main.py b/tests/test_main.py index 089b6823..1d31f13a 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -95,6 +95,7 @@ def test_prepare_for_datalad(tmpdir): # and all are under git for f in target_files: assert not ds.repo.is_under_annex(f) + assert not ds.repo.is_under_annex('.gitattributes') def test_json_dumps_pretty(): pretty = heudiconv.json_dumps_pretty From 69caae3d1dbaad2a6da6b4ace2a31990374f55d7 Mon Sep 17 00:00:00 2001 From: DBIC BIDS Team Date: Mon, 24 Apr 2017 19:09:42 +0000 Subject: [PATCH 144/181] BF: few more ignores and renames --- heuristics/dbic_bids.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 41741504..0be35898 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -24,6 +24,7 @@ 'A000130': ['^15-'], 'A000137': ['^9-', '^11-'], 'A000297': ['^12-'], + 'A000326': ['^15-'], 'A000376': ['^15-'], 'A000384': ['^8-', '^11-'], 'A000467': ['^15-'], @@ -104,6 +105,13 @@ # need to add incrementing session -- study should have 2 ('scout_run\+$', 'scout_run+_ses+'), ], + 'da218a66de902adb3ad9407d514e3639': + [ + # those sequences renamed later to include DTI- in their acq- + # so fot consistency + ('hardi_64', 'dwi_acq-DTI-hardi64'), + ('acq-hardi', 'acq-DTI-hardi'), + ], } keys2replace = ['protocol_name', 'series_description'] From 993cd43ec77e1a4285cda220b11c6710f29dd9e7 Mon Sep 17 00:00:00 2001 From: DBIC BIDS Team Date: Mon, 24 Apr 2017 19:12:50 +0000 Subject: [PATCH 145/181] ENH/BF(?): provide outdir into get_study_sessions was found uncommitted... but ATM was having problems processing qa with _ses-{date} --- bin/heudiconv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/heudiconv b/bin/heudiconv index 759631a6..a72652f3 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1611,7 +1611,7 @@ def _main(args): for f in files_opt: study_sessions = get_study_sessions( dicom_dir_template, [f], - heuristic, None, session, subjs, grouping=grouping) + heuristic, outdir, session, subjs, grouping=grouping) print(f) for study_session, sequences in study_sessions.items(): suf = '' From e5b3a70a323a979ae1fed4827ce694831580f627 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 24 Apr 2017 12:23:25 -0700 Subject: [PATCH 146/181] BF: do not 'fix' anat-scout sequence for qa so we retain the date session marker --- heuristics/dbic_bids.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 0be35898..a064bd8e 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -40,7 +40,8 @@ '43b67d9139e8c7274578b7451ab21123': [ #('anat-scout.*', 'anat-scout_ses-{date}'), - ('anat-scout.*', 'anat-scout'), + # do not change it so we retain _ses-{date} + #('anat-scout.*', 'anat-scout'), ('BOLD_p2_s4_3\.5mm', 'func_task-rest_acq-p2-s4-3.5mm'), ('BOLD_p2_s4', 'func_task-rest_acq-p2-s4'), ('BOLD_p2_noprescannormalize', 'func-bold_task-rest_acq-p2noprescannormalize'), From f30dae90c2e8ceffb85e6e31d27b08aa69034b29 Mon Sep 17 00:00:00 2001 From: DBIC BIDS Team Date: Mon, 24 Apr 2017 22:25:26 +0000 Subject: [PATCH 147/181] Do not regenerate dataset_description.json --- bin/heudiconv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/heudiconv b/bin/heudiconv index a72652f3..ea233938 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1374,7 
+1374,7 @@ def populate_bids_templates(path, defaults={}): # dataset descriptor lgr.info("Populating template files under %s", path) descriptor = opj(path, 'dataset_description.json') - if True: # not exists(descriptor): + if not exists(descriptor): save_json(descriptor, ordereddict([ ('Name', "TODO: name of the dataset"), From f64dd8ff35e096d265dbbb887a4645fdd883438c Mon Sep 17 00:00:00 2001 From: DBIC BIDS Team Date: Fri, 28 Apr 2017 00:35:47 +0000 Subject: [PATCH 148/181] RF: centralized access to study_description so we could patch easily if needed --- heuristics/dbic_bids.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index a064bd8e..4573ec10 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -196,11 +196,14 @@ def md5sum(string): m = hashlib.md5(string.encode()) return m.hexdigest() +def get_study_description(seqinfo): + # Centralized so we could fix/override + v = get_unique(seqinfo, 'study_description') + return v def get_study_hash(seqinfo): # XXX: ad hoc hack - study_description = get_unique(seqinfo, 'study_description') - return md5sum(study_description) + return md5sum(get_study_description(seqinfo)) def fix_canceled_runs(seqinfo, accession2run=fix_accession2run): @@ -385,7 +388,7 @@ def infotodict(seqinfo): # XXX if we have a known earlier study, we need to always # increase the run counter for phasediff because magnitudes # were not acquired - if md5sum(s.study_description) == '9d148e2a05f782273f6343507733309d': + if get_study_hash([s]) == '9d148e2a05f782273f6343507733309d': current_run += 1 else: raise RuntimeError( @@ -514,7 +517,7 @@ def get_unique(seqinfos, attr): def infotoids(seqinfos, outdir): # decide on subjid and session based on patient_id lgr.info("Processing sequence infos to deduce study/session") - study_description = get_unique(seqinfos, 'study_description') + study_description = get_study_description(seqinfos) study_description_hash = md5sum(study_description) subject = fixup_subjectid(get_unique(seqinfos, 'patient_id')) # TODO: fix up subject id if missing some 0s From edf305f1440b63b2d752b049bc04a81eb46b1e64 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 4 May 2017 23:50:21 -0400 Subject: [PATCH 149/181] ENH: primarily notes planning --- bin/heudiconv | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/bin/heudiconv b/bin/heudiconv index ea233938..ec042437 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -931,6 +931,13 @@ def tuneup_bids_json_files(json_files): # Harmonize generic .json formatting for jsonfile in json_files: json_ = json.load(open(jsonfile)) + # sanitize + for f in ['AcquisitionDateTime', 'AcquisitionDate']: + json_.pop(f, None) + if 'Date' in str(json_): + # Let's hope no word 'Date' comes within a study name or smth like + # that + raise ValueError("There must be no dates in .json sidecar") json.dump(json_, open(jsonfile, 'w'), indent=2) # Load the beast @@ -1628,6 +1635,8 @@ def _main(args): f, getattr(heuristic, 'DEFAULT_FIELDS', {}) ) + elif args.command == 'sanitize-jsons': + tuneup_bids_json_files(files_opt) else: raise ValueError("Unknown command %s", args.command) return @@ -1857,6 +1866,11 @@ def main(argv=None): parser = get_parser() args = parser.parse_args(argv) + # TODO: assign distribution-restrictions=sensitive + + # TODO(privat): entirety of .heudiconv/ should get under annex and be marked + # as sensitive since date incorporated in the path might leak + # TODO: deprecate dicom_dir_template in 
favor of --files-templated or # smth like that which could take {subject} {session} ... and process # files argument(s) correspondingly before passing into group_dicoms_into_seqinfos From 65ff1b2d4df367151bd246fda1c939c82beb77f5 Mon Sep 17 00:00:00 2001 From: DBIC BIDS Team Date: Sat, 13 May 2017 02:53:58 +0000 Subject: [PATCH 150/181] more ignores for our heuristic validator --- heuristics/dbic_bids_validator.cfg | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/heuristics/dbic_bids_validator.cfg b/heuristics/dbic_bids_validator.cfg index 5a05e5fb..103ab591 100644 --- a/heuristics/dbic_bids_validator.cfg +++ b/heuristics/dbic_bids_validator.cfg @@ -1,7 +1,13 @@ { - "ignore": [], + "ignore": [ + "TOTAL_READOUT_TIME_NOT_DEFINED" + ], "warn": [], "error": [], - "ignoredFiles": ["/.heudiconv/*", "/.heudiconv/*/*", "/.heudiconv/*/*/*", "/.heudiconv/*/*/*/*", "/.git*", "/.datalad/*"] + "ignoredFiles": [ + "/.heudiconv/*", "/.heudiconv/*/*", "/.heudiconv/*/*/*", "/.heudiconv/*/*/*/*", + "/.git*", "/.datalad/*", + "/sub*/ses*/*/*__dup*", "/sub*/*/*__dup*" + ] } From f4f0d2730b1e32ab63668a7b28a0e44ecef93e9f Mon Sep 17 00:00:00 2001 From: DBIC BIDS Team Date: Wed, 17 May 2017 17:59:25 +0000 Subject: [PATCH 151/181] adjusted validator config to ignore .datalad/.gitattributes file --- heuristics/dbic_bids_validator.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/heuristics/dbic_bids_validator.cfg b/heuristics/dbic_bids_validator.cfg index 103ab591..0da468b5 100644 --- a/heuristics/dbic_bids_validator.cfg +++ b/heuristics/dbic_bids_validator.cfg @@ -6,7 +6,8 @@ "error": [], "ignoredFiles": [ "/.heudiconv/*", "/.heudiconv/*/*", "/.heudiconv/*/*/*", "/.heudiconv/*/*/*/*", - "/.git*", "/.datalad/*", + "/.git*", + "/.datalad/*", "/.datalad/.*", "/sub*/ses*/*/*__dup*", "/sub*/*/*__dup*" ] } From 1ea21de95b1056ade3a1452afc75f82eaae633a1 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Wed, 17 May 2017 14:49:30 -0400 Subject: [PATCH 152/181] one more note --- bin/heudiconv | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/heudiconv b/bin/heudiconv index ec042437..ed03280b 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -931,9 +931,11 @@ def tuneup_bids_json_files(json_files): # Harmonize generic .json formatting for jsonfile in json_files: json_ = json.load(open(jsonfile)) - # sanitize + # sanitize! for f in ['AcquisitionDateTime', 'AcquisitionDate']: json_.pop(f, None) + # TODO: should actually be placed into series file which must + # go under annex (not under git) and marked as sensitive if 'Date' in str(json_): # Let's hope no word 'Date' comes within a study name or smth like # that From dad3ebf0b0b295af8183a8a4b6b442fda79c0b82 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 27 May 2017 16:01:52 -0400 Subject: [PATCH 153/181] fix: change from series_number to series_id --- README.md | 2 +- heuristics/banda-bids.py | 10 +++++----- heuristics/bids_with_ses.py | 26 +++++++++++++------------- heuristics/cmrr_heuristic.py | 32 ++++++++++++++++---------------- heuristics/convertall.py | 4 ++-- heuristics/example.py | 2 +- 6 files changed, 38 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index f35db48a..e85f53dc 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ function `infotodict`, which takes a single argument `seqinfo`. 
* total_files_till_now * example_dcm_file -* series_number +* series_id * dcm_dir_name * unspecified2 * unspecified3 diff --git a/heuristics/banda-bids.py b/heuristics/banda-bids.py index 92d8391a..946c33b7 100644 --- a/heuristics/banda-bids.py +++ b/heuristics/banda-bids.py @@ -38,9 +38,9 @@ def infotodict(seqinfo): for idx, s in enumerate(seqinfo): # T1 and T2 scans if (s.dim3 == 208) and (s.dim4 == 1) and ('T1w' in s.protocol_name): - info[t1] = [s.series_number] + info[t1] = [s.series_id] if (s.dim3 == 208) and ('T2w' in s.protocol_name): - info[t2] = [s.series_number] + info[t2] = [s.series_id] # diffusion scans if ('dMRI_dir9' in s.protocol_name): key = None @@ -49,7 +49,7 @@ def infotodict(seqinfo): elif (s.dim4 == 1) and ('SBRef' in s.series_description): key = dwi_sbref if key: - info[key].append({'item': s.series_number}) + info[key].append({'item': s.series_id}) # functional scans if ('fMRI' in s.protocol_name): tasktype = s.protocol_name.split('fMRI')[1].split('_')[1] @@ -65,10 +65,10 @@ def infotodict(seqinfo): if 'conflict' in tasktype: key = conflict_sbref if 'gambling' in tasktype: key = gamble_sbref if key: - info[key].append({'item': s.series_number}) + info[key].append({'item': s.series_id}) if (s.dim4 == 3) and ('SpinEchoFieldMap' in s.protocol_name): dirtype = s.protocol_name.split('_')[-1] - info[fmap].append({'item': s.series_number, 'dir': dirtype}) + info[fmap].append({'item': s.series_id, 'dir': dirtype}) # You can even put checks in place for your protocol msg = [] diff --git a/heuristics/bids_with_ses.py b/heuristics/bids_with_ses.py index 1972b51b..e8dc84d8 100644 --- a/heuristics/bids_with_ses.py +++ b/heuristics/bids_with_ses.py @@ -37,38 +37,38 @@ def infotodict(seqinfo): last_run = len(seqinfo) for s in seqinfo: if (s.dim3 == 176 or s.dim3 == 352) and (s.dim4 == 1) and ('MEMPRAGE' in s.protocol_name): - info[t1] = [s.series_number] + info[t1] = [s.series_id] elif (s.dim4 == 1) and ('MEMPRAGE' in s.protocol_name): - info[t1] = [s.series_number] + info[t1] = [s.series_id] elif (s.dim3 == 176 or s.dim3 == 352) and (s.dim4 == 1) and ('T2_SPACE' in s.protocol_name): - info[t2] = [s.series_number] + info[t2] = [s.series_id] elif ('field_mapping_diffusion' in s.protocol_name): - info[fm_diff].append([s.series_number]) + info[fm_diff].append([s.series_id]) elif (s.dim4 >= 70) and ('DIFFUSION_HighRes_AP' in s.protocol_name): - info[dwi_ap].append([s.series_number]) + info[dwi_ap].append([s.series_id]) elif ('DIFFUSION_HighRes_PA' in s.protocol_name): - info[dwi_pa].append([s.series_number]) + info[dwi_pa].append([s.series_id]) elif ('field_mapping_resting' in s.protocol_name): - info[fm_rest].append([s.series_number]) + info[fm_rest].append([s.series_id]) elif (s.dim4 == 144) and ('resting' in s.protocol_name): if not s.is_motion_corrected: - info[rs].append([(s.series_number)]) + info[rs].append([(s.series_id)]) elif (s.dim4 == 183 or s.dim4 == 366) and ('localizer' in s.protocol_name): if not s.is_motion_corrected: - info[boldt1].append([s.series_number]) + info[boldt1].append([s.series_id]) elif (s.dim4 == 227 or s.dim4 == 454) and ('transfer1' in s.protocol_name): if not s.is_motion_corrected: - info[boldt2].append([s.series_number]) + info[boldt2].append([s.series_id]) elif (s.dim4 == 227 or s.dim4 == 454) and ('transfer2' in s.protocol_name): if not s.is_motion_corrected: - info[boldt3].append([s.series_number]) + info[boldt3].append([s.series_id]) elif (('run1' in s.protocol_name) or ('run6' in s.protocol_name)) and (s.dim4 == 159): if not 
s.is_motion_corrected: - info[nofb_task].append([s.series_number]) + info[nofb_task].append([s.series_id]) elif (('run2' in s.protocol_name) or ('run3' in s.protocol_name) or ('run4' in s.protocol_name) or ('run5' in s.protocol_name)) and (s.dim4 == 159): if not s.is_motion_corrected: - info[fb_task].append([s.series_number]) + info[fb_task].append([s.series_id]) else: pass return info diff --git a/heuristics/cmrr_heuristic.py b/heuristics/cmrr_heuristic.py index 79c370c1..449b5473 100644 --- a/heuristics/cmrr_heuristic.py +++ b/heuristics/cmrr_heuristic.py @@ -31,41 +31,41 @@ def infotodict(seqinfo): for idx, s in enumerate(seqinfo): if (s.dim3 == 208) and (s.dim4 == 1) and ('T1w' in s.protocol_name): - info[t1] = [s.series_number] + info[t1] = [s.series_id] if (s.dim3 == 208) and ('T2w' in s.protocol_name): - info[t2] = [s.series_number] + info[t2] = [s.series_id] if (s.dim4 >= 99) and (('dMRI_dir98_AP' in s.protocol_name) or ('dMRI_dir99_AP' in s.protocol_name)): acq = s.protocol_name.split('dMRI_')[1].split('_')[0] + 'AP' - info[dwi].append({'item': s.series_number, 'acq': acq}) + info[dwi].append({'item': s.series_id, 'acq': acq}) if (s.dim4 >= 99) and (('dMRI_dir98_PA' in s.protocol_name) or ('dMRI_dir99_PA' in s.protocol_name)): acq = s.protocol_name.split('dMRI_')[1].split('_')[0] + 'PA' - info[dwi].append({'item': s.series_number, 'acq': acq}) + info[dwi].append({'item': s.series_id, 'acq': acq}) if (s.dim4 == 1) and (('dMRI_dir98_AP' in s.protocol_name) or ('dMRI_dir99_AP' in s.protocol_name)): acq = s.protocol_name.split('dMRI_')[1].split('_')[0] - info[fmap_dwi].append({'item': s.series_number, 'dir': 'AP', 'acq': acq}) + info[fmap_dwi].append({'item': s.series_id, 'dir': 'AP', 'acq': acq}) if (s.dim4 == 1) and (('dMRI_dir98_PA' in s.protocol_name) or ('dMRI_dir99_PA' in s.protocol_name)): acq = s.protocol_name.split('dMRI_')[1].split('_')[0] - info[fmap_dwi].append({'item': s.series_number, 'dir': 'PA', 'acq': acq}) + info[fmap_dwi].append({'item': s.series_id, 'dir': 'PA', 'acq': acq}) if (s.dim4 == 420) and ('rfMRI_REST_AP' in s.protocol_name): - info[rest].append({'item': s.series_number, 'acq': 'AP'}) + info[rest].append({'item': s.series_id, 'acq': 'AP'}) if (s.dim4 == 420) and ('rfMRI_REST_PA' in s.protocol_name): - info[rest].append({'item': s.series_number, 'acq': 'PA'}) + info[rest].append({'item': s.series_id, 'acq': 'PA'}) if (s.dim4 == 1) and ('rfMRI_REST_AP' in s.protocol_name): if seqinfo[idx + 1][9] != 420: continue - info[fmap_rest].append({'item': s.series_number, 'dir': 'AP', 'acq': ''}) + info[fmap_rest].append({'item': s.series_id, 'dir': 'AP', 'acq': ''}) if (s.dim4 == 1) and ('rfMRI_REST_PA' in s.protocol_name): - info[fmap_rest].append({'item': s.series_number, 'dir': 'PA', 'acq': ''}) + info[fmap_rest].append({'item': s.series_id, 'dir': 'PA', 'acq': ''}) if (s.dim4 == 346) and ('tfMRI_faceMatching_AP' in s.protocol_name): - info[face].append({'item': s.series_number, 'acq': 'AP'}) + info[face].append({'item': s.series_id, 'acq': 'AP'}) if (s.dim4 == 346) and ('tfMRI_faceMatching_PA' in s.protocol_name): - info[face].append({'item': s.series_number, 'acq': 'PA'}) + info[face].append({'item': s.series_id, 'acq': 'PA'}) if (s.dim4 == 288) and ('tfMRI_conflict_AP' in s.protocol_name): - info[conflict].append({'item': s.series_number, 'acq': 'AP'}) + info[conflict].append({'item': s.series_id, 'acq': 'AP'}) if (s.dim4 == 288) and ('tfMRI_conflict_PA' in s.protocol_name): - info[conflict].append({'item': s.series_number, 'acq': 'PA'}) + 
info[conflict].append({'item': s.series_id, 'acq': 'PA'}) if (s.dim4 == 223) and ('tfMRI_gambling_AP' in (s.protocol_name)): - info[gamble].append({'item': s.series_number, 'acq': 'AP'}) + info[gamble].append({'item': s.series_id, 'acq': 'AP'}) if (s.dim4 == 223) and ('tfMRI_gambling_PA' in s.protocol_name): - info[gamble].append({'item': s.series_number, 'acq': 'PA'}) + info[gamble].append({'item': s.series_id, 'acq': 'PA'}) return info diff --git a/heuristics/convertall.py b/heuristics/convertall.py index 0b264a81..eb315e49 100644 --- a/heuristics/convertall.py +++ b/heuristics/convertall.py @@ -28,7 +28,7 @@ def infotodict(seqinfo): * total_files_till_now * example_dcm_file - * series_number + * series_id * dcm_dir_name * unspecified2 * unspecified3 @@ -48,5 +48,5 @@ def infotodict(seqinfo): * image_type """ - info[data].append(s.series_number) + info[data].append(s.series_id) return info diff --git a/heuristics/example.py b/heuristics/example.py index 324d875b..5c2bc5ca 100644 --- a/heuristics/example.py +++ b/heuristics/example.py @@ -86,4 +86,4 @@ def infotodict(seqinfo): info[fmrest].append(s[2]) else: pass - return info \ No newline at end of file + return info From e4b1eb8a6dc15b5924368572f2d40d5a77f2e0d8 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 27 May 2017 16:02:23 -0400 Subject: [PATCH 154/181] fix: change template to use {subject} format --- bin/heudiconv | 13 ++++++------- tests/test_heuristics.py | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 805f12b7..9039f2ca 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -119,7 +119,6 @@ StudySessionInfo = namedtuple( class TempDirs(object): """A helper to centralize handling and cleanup of dirs""" - def __init__(self): self.dirs = [] self.exists = os.path.exists @@ -480,8 +479,8 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, grouping='stud image_type, accession_number, # For demographics to populate BIDS participants.tsv - dcminfo.get('PatientsAge'), - dcminfo.get('PatientsSex'), + dcminfo.get('PatientAge'), + dcminfo.get('PatientSex'), dcminfo.get('AcquisitionDate'), ) # candidates @@ -1287,12 +1286,12 @@ def get_study_sessions(dicom_dir_template, files_opt, heuristic, outdir, assert not files_opt # see above TODO assert sids # expand the input template - if '%s' not in dicom_dir_template: + if '{subject}' not in dicom_dir_template: raise ValueError( - "dicom dir template must have '%s' as a placeholder for a " + "dicom dir template must have {subject} as a placeholder for a " "subject id. Got %r" % dicom_dir_template) for sid in sids: - sdir = dicom_dir_template % sid + sdir = dicom_dir_template.format(subject=sid, session=session) # and see what matches files = sorted(glob(sdir)) for session_, files_ in get_extracted_dicoms(files): @@ -1784,7 +1783,7 @@ def get_parser(): """ Example: - heudiconv -d rawdata/%s -o . -f + heudiconv -d rawdata/{subject} -o . 
-f heuristic.py -s s1 s2 s3 """)) diff --git a/tests/test_heuristics.py b/tests/test_heuristics.py index 212b3c88..d52f259b 100644 --- a/tests/test_heuristics.py +++ b/tests/test_heuristics.py @@ -12,7 +12,7 @@ def test_smoke_converall(tmpdir): heudiconv.main( ("-f heuristics/convertall.py -c dcm2niix -o %s -b --datalad " - "-s fmap_acq-3mm -d tests/data/%%s/*" % tmpdir).split(' ') + "-s fmap_acq-3mm -d tests/data/{subject}/*" % tmpdir).split(' ') ) @@ -20,7 +20,7 @@ def test_smoke_converall(tmpdir): @pytest.mark.parametrize( 'invocation', [ "tests/data", # our new way with automated groupping - "-d tests/data/%s/* -s 01-fmap_acq-3mm" # "old" way specifying subject + "-d tests/data/{subject}/* -s 01-fmap_acq-3mm" # "old" way specifying subject # should produce the same results ]) def test_dbic_bids_largely_smoke(tmpdir, heuristic, invocation): From c8b6c65c9103ea36a3ba58fa8e4bff50fa6e4ca1 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 27 May 2017 16:02:39 -0400 Subject: [PATCH 155/181] ref: update Dockerfile --- Dockerfile | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4dfdc172..27dd088e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,10 +2,31 @@ FROM continuumio/miniconda MAINTAINER -RUN apt-get update && apt-get upgrade -y && apt-get install -y g++ && apt-get clean -y && apt-get autoclean -y && apt-get autoremove -y -RUN cd /tmp && git clone https://github.com/neurolabusc/dcm2niix.git && cd dcm2niix/console/ && git checkout e262240bb27e8f4c9440d6b1a97dfd98ef0f9939 && g++ -O3 -I. main_console.cpp nii_dicom.cpp jpg_0XC3.cpp ujpeg.cpp nifti1_io_core.cpp nii_ortho.cpp nii_dicom_batch.cpp -o dcm2niix -DmyDisableOpenJPEG -DmyDisableJasper && cp dcm2niix /usr/local/bin/ -RUN conda install -y -c conda-forge nipype && pip install https://github.com/moloney/dcmstack/archive/c12d27d2c802d75a33ad70110124500a83e851ee.zip && pip install https://github.com/nipy/nipype/archive/dd1ed4f0d5735c69c1743f29875acf09d23a62e0.zip -RUN curl -O https://raw.githubusercontent.com/nipy/heudiconv/master/bin/heudiconv && chmod +x heudiconv && cp heudiconv /usr/local/bin/ -RUN curl -O https://raw.githubusercontent.com/nipy/heudiconv/master/heuristics/convertall.py && chmod +x convertall.py +RUN apt-get update && apt-get upgrade -y && \ + apt-get install -y g++ pkg-config make && \ + apt-get clean -y && apt-get autoclean -y && apt-get autoremove -y +RUN (wget -O- http://neuro.debian.net/lists/jessie.us-nh.full | tee /etc/apt/sources.list.d/neurodebian.sources.list) && \ + apt-key adv --recv-keys --keyserver hkp://pool.sks-keyservers.net:80 0xA5D32F012649A5A9 && \ + apt-get update -qq && apt-get install -y git-annex-standalone && \ + apt-get clean -y && apt-get autoclean -y && apt-get autoremove -y +RUN conda install -y -c conda-forge nipype && \ + conda install cmake && \ + pip install https://github.com/moloney/dcmstack/archive/c12d27d2c802d75a33ad70110124500a83e851ee.zip && \ + pip install datalad && \ + conda clean -tipsy && rm -rf ~/.pip/ +RUN cd /tmp && git clone https://github.com/neurolabusc/dcm2niix.git && \ + cd dcm2niix && \ + git checkout 19be415ba68c0bc52e13729c6de9e5ff9c3ab443 && \ + mkdir build && cd build && cmake -DBATCH_VERSION=ON .. 
&& \ + make && make install && \ + cd / && rm -rf /tmp/dcm2niix + +COPY bin/heudiconv /usr/local/bin/heudiconv +RUN chmod +x /usr/local/bin/heudiconv +RUN mkdir /heuristics +COPY heuristics/convertall.py /heuristics +RUN chmod +x /heuristics/convertall.py +RUN git config --global user.email "test@docker.land" && \ + git config --global user.name "Docker Almighty" ENTRYPOINT ["/usr/local/bin/heudiconv"] From d663c98a37e545f2f7d82e68c6fd5809a5fb75f2 Mon Sep 17 00:00:00 2001 From: DBIC BIDS Team Date: Wed, 31 May 2017 14:26:24 +0000 Subject: [PATCH 156/181] ENH: do not crash if no magnitude file was generated so we cannot extract EchoTime --- bin/heudiconv | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index ea233938..0568c016 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -948,9 +948,13 @@ def tuneup_bids_json_files(json_files): # For now just save truthfully by loading magnitude files lgr.debug("Placing EchoTime fields into phasediff file") for i in 1, 2: - json_['EchoTime%d' % i] = \ - json.load(open(json_basename + '_magnitude%d.json' % i))[ - 'EchoTime'] + try: + json_['EchoTime%d' % i] = \ + json.load(open(json_basename + '_magnitude%d.json' % i))[ + 'EchoTime'] + except IOError as exc: + lgr.error("Failed to open magnitude file: %s", exc) + # might have been made R/O already os.chmod(json_phasediffname, 0o0664) json.dump(json_, open(json_phasediffname, 'w'), indent=2) From f55553d3712ac8e8cc57c1d365c5937ea8d73bfb Mon Sep 17 00:00:00 2001 From: Matthew Brett Date: Wed, 31 May 2017 22:53:14 +0100 Subject: [PATCH 157/181] TST: enable travis Python 3 tests Enable Python 3 tests on travis. These are current failing - see: https://travis-ci.org/matthew-brett/heudiconv/jobs/238120051 This is due to some Python 3 bugs in heudiconv, but there may be some problems with dcmstack also; although I have installed a WIP PR version of dcmstack for Python 3, it has not been much tested. --- .travis.yml | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8feb9c47..73457cf3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,12 +2,9 @@ language: python python: - 2.7 -# For PY3 testing we should use probably conda, so TODO -# since otherwise scipy fails to build ATM etc, see e.g. -# https://travis-ci.org/nipy/heudiconv/jobs/172172847 -# - 3.3 -# - 3.4 -# - 3.5 + - 3.4 + - 3.5 + - 3.6 cache: - apt @@ -19,18 +16,22 @@ env: - DATALAD_TESTS_SSH=1 before_install: - # for now even remove requirements.txt since dependencies aren't avail - - echo '' > requirements.txt - - pip install -r dev-requirements.txt - - pip install codecov # The ultimate one-liner setup for NeuroDebian repository - bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh) - travis_retry sudo apt-get update -qq - - travis_retry sudo apt-get install git-annex-standalone python-dicom python-nipype python-nibabel dcm2niix - - pip install datalad git+git://github.com/moloney/dcmstack/ - # there is only dated nipype for precise from neurodebian. 
- # TEMP: configparser needs to be listed manually for now: https://github.com/nipy/nipype/pull/1697 - - pip install nipype configparser + - travis_retry sudo apt-get install git-annex-standalone dcm2niix + # Install in our own virtualenv + - python -m pip install --upgrade pip + - pip install --upgrade virtualenv + - virtualenv --python=python venv + - source venv/bin/activate + - python --version # just to check + # Have to install dcmstack from git + # This is a branch with changes for Python 3 + - pip install git+git://github.com/ghisvail/dcmstack@master + - pip install -r dev-requirements.txt + - pip install datalad + - pip install codecov pytest install: - git config --global user.email "test@travis.land" From 52a83eed2acd77d2c56f3cecda969d266d189446 Mon Sep 17 00:00:00 2001 From: Matthew Brett Date: Tue, 6 Jun 2017 13:09:19 +0100 Subject: [PATCH 158/181] BF: fix Python 3 errors causing test failures I don't know if the output is actually correct with Python 3, but at least the tests all pass with these changes, which I think are correct. --- bin/heudiconv | 42 ++++++++++++++++++++++++----------------- heuristics/dbic_bids.py | 15 +++++++++++---- tests/test_main.py | 4 ++-- 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index d3682d56..80a91eed 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -25,8 +25,9 @@ from glob import glob import inspect import json import os -import shutil import sys +import re +import shutil from tempfile import mkdtemp import tarfile @@ -152,6 +153,21 @@ class TempDirs(object): tempdirs = TempDirs() +def _canonical_dumps(json_obj, **kwargs): + """ Dump `json_obj` to string, allowing for Python newline bug + + Runs ``json.dumps(json_obj, \*\*kwargs), then removes trailing whitespaces + added when doing indent in some Python versions. See + https://bugs.python.org/issue16333. Bug seems to be fixed in 3.4, for now + fixing manually not only for aestetics but also to guarantee the same + result across versions of Python. + """ + out = json.dumps(json_obj, **kwargs) + if 'indent' in kwargs: + out = out.replace(' \n', '\n') + return out + + def save_json(filename, data): """Save data to a json file @@ -163,14 +179,8 @@ def save_json(filename, data): Dictionary to save in json file. """ - # adds trailing whitespaces due to indent - # see https://bugs.python.org/issue16333 - # seems to be fixed in 3.4, for now fixing manually - # not only for aestetics but also to help guaranteeing the - # same result across versions of Python with open(filename, 'w') as fp: - json_str = json.dumps(data, sort_keys=True, indent=4) - fp.write(json_str.replace(' \n', '\n')) + fp.write(_canonical_dumps(data, sort_keys=True, indent=4)) def slim_down_info(j): @@ -199,8 +209,7 @@ def json_dumps_pretty(j, indent=2, sort_keys=True): If resultant structure differs from original -- throws exception """ - import re - js = json.dumps(j, indent=indent, sort_keys=sort_keys) + js = _canonical_dumps(j, indent=indent, sort_keys=sort_keys) # trim away \n and spaces between entries of numbers js_ = re.sub( '[\n ]+("?[-+.0-9e]+"?,?) 
*\n(?= *"?[-+.0-9e]+"?)', r' \1', @@ -242,7 +251,6 @@ def load_json(filename): # find_files utility copied/borrowed from DataLad (Copyright 2016 DataLad developers, MIT license) # -import re from os.path import sep as dirsep from os.path import curdir from os.path import join as opj @@ -326,7 +334,7 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, grouping='stud # which would differ already if flfilter: nfl_before = len(fl) - fl = filter(flfilter, fl) + fl = list(filter(flfilter, fl)) nfl_after = len(fl) lgr.info('Filtering out {0} dicoms based on their filename'.format( nfl_before-nfl_after)) @@ -453,7 +461,7 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, grouping='stud except AttributeError: TE = -1 try: - refphys = dcminfo.ReferringPhysicianName + refphys = str(dcminfo.ReferringPhysicianName) except AttributeError: refphys = '-' @@ -632,7 +640,6 @@ def embed_nifti(dcmfiles, niftifile, infofile, bids_info=None, force=False, min_ meta = ds.NiftiWrapper(new_nii).meta_ext.to_json() meta_info = json.loads(meta) if bids_info: - import re if min_meta: meta_info = bids_info else: @@ -827,7 +834,7 @@ def convert(items, symlink=True, converter=None, convertnode = Node(Dcm2niix(), name='convert') convertnode.base_dir = tmpdir # need to be abspaths! - item_dicoms = map(os.path.abspath, item_dicoms) + item_dicoms = list(map(os.path.abspath, item_dicoms)) convertnode.inputs.source_names = item_dicoms if converter == 'dcm2nii': convertnode.inputs.gzip_output = outtype == 'nii.gz' @@ -1187,11 +1194,12 @@ def convert_dicoms(sid, if is_bids: if seqinfo: + keys = list(seqinfo) add_participant_record( anon_outdir, anon_sid, - seqinfo.keys()[0].patient_age, - seqinfo.keys()[0].patient_sex, + keys[0].patient_age, + keys[0].patient_sex, ) populate_bids_templates( anon_outdir, diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 4573ec10..372e2b6b 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -2,7 +2,6 @@ import re from collections import OrderedDict import hashlib - from glob import glob import logging @@ -134,6 +133,15 @@ } +def _delete_chars(from_str, deletechars): + """ Delete characters from string allowing for Python 2 / 3 difference + """ + try: + return from_str.translate(None, deletechars) + except TypeError: + return from_str.translate(str.maketrans('', '', deletechars)) + + def filter_dicom(dcmdata): """Return True if a DICOM dataset should be filtered out, else False""" return True if dcmdata.StudyInstanceUID in dicoms2skip else False @@ -480,8 +488,7 @@ def get_dups_marked(info): # were "cancelled" info = info.copy() dup_id = 0 - for template in info: - series_ids = info[template] + for template, series_ids in list(info.items()): if len(series_ids) > 1: lgr.warning("Detected %d duplicated run(s) for template %s: %s", len(series_ids) - 1, template[0], series_ids[:-1]) @@ -615,7 +622,7 @@ def infotoids(seqinfos, outdir): def sanitize_str(value): """Remove illegal characters for BIDS from task/acq/etc..""" - return value.translate(None, '#!@$%^&.,:;_-') + return _delete_chars(value, '#!@$%^&.,:;_-') def parse_dbic_protocol_name(protocol_name): diff --git a/tests/test_main.py b/tests/test_main.py index 1d31f13a..8a65400f 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -101,6 +101,6 @@ def test_json_dumps_pretty(): pretty = heudiconv.json_dumps_pretty assert pretty({}) == "{}" assert pretty({"a": -1, "b": "123", "c": [1, 2, 3], "d": ["1.0", "2.0"]}) \ - == '{\n "a": -1, \n "b": "123", \n "c": [1, 2, 3], \n 
"d": ["1.0", "2.0"]\n}' + == '{\n "a": -1,\n "b": "123",\n "c": [1, 2, 3],\n "d": ["1.0", "2.0"]\n}' assert pretty({'a': ["0.3", "-1.9128906358217845e-12", "0.2"]}) \ - == '{\n "a": ["0.3", "-1.9128906358217845e-12", "0.2"]\n}' \ No newline at end of file + == '{\n "a": ["0.3", "-1.9128906358217845e-12", "0.2"]\n}' From 85215c97cbe7edf7685245aa8865e0ae1147d85c Mon Sep 17 00:00:00 2001 From: Matthew Brett Date: Tue, 6 Jun 2017 13:10:35 +0100 Subject: [PATCH 159/181] MAINT: tell git to ignore some common cruft --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 0d20b648..52671470 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ *.pyc +.cache/ +.coverage +*.egg-info/ From 2cb6a891697179b34eb569d845123b6994533989 Mon Sep 17 00:00:00 2001 From: Matthew Brett Date: Tue, 6 Jun 2017 13:50:52 +0100 Subject: [PATCH 160/181] MAINT: move dcmstack source into requirements.txt Point to Python 3 branch of dcmstack for requirements. --- .travis.yml | 3 --- requirements.txt | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 73457cf3..b11f94c8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,9 +26,6 @@ before_install: - virtualenv --python=python venv - source venv/bin/activate - python --version # just to check - # Have to install dcmstack from git - # This is a branch with changes for Python 3 - - pip install git+git://github.com/ghisvail/dcmstack@master - pip install -r dev-requirements.txt - pip install datalad - pip install codecov pytest diff --git a/requirements.txt b/requirements.txt index 79d31ffe..342b4e58 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ pydicom -dcmstack +# This is a dcmstack branch with changes for Python 3 +git+git://github.com/ghisvail/dcmstack@master rdflib nipype From 7801ded83b68b62b899710ae316b32d6b40346c8 Mon Sep 17 00:00:00 2001 From: Matthew Brett Date: Tue, 6 Jun 2017 14:14:24 +0100 Subject: [PATCH 161/181] MAINT: rdflib is requirement of nipype --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 342b4e58..0bca3cc3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ pydicom # This is a dcmstack branch with changes for Python 3 git+git://github.com/ghisvail/dcmstack@master -rdflib nipype From 6348fc1a27d627ce31c21a0cbeee5c0ee9f21f7d Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Fri, 23 Jun 2017 22:18:51 -0700 Subject: [PATCH 162/181] BF: add missing import re in embedder --- bin/heudiconv | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/heudiconv b/bin/heudiconv index 8de37ccb..83999419 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -618,6 +618,7 @@ def embed_nifti(dcmfiles, niftifile, infofile, bids_info=None, force=False, min_ import nibabel as nb import os import json + import re meta_info = {} if not min_meta: import dcmstack as ds From 25e77d58e27f51cd8426b23ee4a3c70665bc8cc5 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Fri, 23 Jun 2017 18:35:25 -0700 Subject: [PATCH 163/181] Add more requirements --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index 0bca3cc3..fb28d827 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,6 @@ pydicom # This is a dcmstack branch with changes for Python 3 git+git://github.com/ghisvail/dcmstack@master nipype +inotify +datalad +tinydb \ No newline at end of file From 84c273cdab88a5417624aa9bbb25abb91c611cb0 Mon Sep 17 00:00:00 2001 
From: Matteo Visconti dOC Date: Fri, 23 Jun 2017 18:35:41 -0700 Subject: [PATCH 164/181] BF: do not test with bids flag if using convertall --- tests/test_heuristics.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_heuristics.py b/tests/test_heuristics.py index d52f259b..d3c1aed5 100644 --- a/tests/test_heuristics.py +++ b/tests/test_heuristics.py @@ -24,9 +24,13 @@ def test_smoke_converall(tmpdir): # should produce the same results ]) def test_dbic_bids_largely_smoke(tmpdir, heuristic, invocation): + is_bids = True if heuristic == 'dbic_bids' else False + arg = "-f heuristics/%s.py -c dcm2niix -o %s" % (heuristic, tmpdir) + if is_bids: + arg += " -b" + arg += " --datalad " args = ( - ("-f heuristics/%s.py -c dcm2niix -o %s -b --datalad " % (heuristic, tmpdir)) - + invocation + arg + invocation ).split(' ') if heuristic != 'dbic_bids' and invocation == 'tests/data': # none other heuristic has mighty infotoids atm From 16000e8dd2336bad11e622b672baca4c630eae8b Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Fri, 23 Jun 2017 19:06:19 -0700 Subject: [PATCH 165/181] NF: save scan_keys tsv file --- bin/heudiconv | 45 ++++++++++++++++++++++++++++++++++++++++ tests/test_heuristics.py | 29 ++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/bin/heudiconv b/bin/heudiconv index 83999419..64ed680e 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -22,6 +22,7 @@ __version__ = '0.2' import argparse from glob import glob +import csv import inspect import json import os @@ -35,10 +36,12 @@ from copy import deepcopy from collections import namedtuple from collections import defaultdict from collections import OrderedDict as ordereddict +from datetime import datetime from os.path import isdir from os.path import basename from os.path import dirname from os.path import exists +from os.path import join as pjoin PY3 = sys.version_info[0] >= 3 @@ -899,6 +902,10 @@ def convert(items, symlink=True, converter=None, ) continue + # save acquisition time information if it's BIDS + # at this point we still have acquisition date + if is_bids: + save_scans_key(items, outname_bids_files) # Fix up and unify BIDS files tuneup_bids_json_files(outname_bids_files) # we should provide specific handling for fmap, @@ -935,6 +942,44 @@ def convert(items, symlink=True, converter=None, shutil.rmtree(tmpdir) +def save_scans_key(items, outname_bids_files): + import dicom as dcm + import dcmstack as ds + header = ['filename', 'acq_time', 'operator'] + rows = [] + for item, outname_bids_file in zip(items, outname_bids_files): + dcm_fn = item[-1][0] + mw = ds.wrapper_from_data(dcm.read_file(dcm_fn, force=True)) + # we need to store filenames and acquisition times + time = mw.dcm_data.AcquisitionTime + hms = time.split('.')[0] + time = [int(hms[sl]) for sl in [slice(2), + slice(2, 4), + slice(4, len(hms))]] + date = mw.dcm_data.AcquisitionDate + date = [int(date[sl]) for sl in [slice(4), + slice(4, 6), + slice(6, len(date))]] + dt = datetime(date[0], date[1], date[2], time[0], time[1]) + acq_time = dt.isoformat() + f_name = '/'.join(outname_bids_file.split('/')[-2:]) + f_name = f_name.replace('json', 'nii.gz') + rows.append((f_name, acq_time, mw.dcm_data.PerformingPhysicianName)) + # where should we store it? 
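+        # (assumed layout: outname_bids_file is the sidecar path such as
+        #  <out>/sub-01/func/sub-01_task-X_bold.json, so the two dirname() calls
+        #  below land at the subject directory, where BIDS expects sub-01_scans.tsv)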
+ output_dir = dirname(dirname(outname_bids_file)) + # get subject info + subj_pattern = re.compile('(sub-[a-zA-Z0-9]*)') + subj = subj_pattern.findall(f_name) + assert(len(subj) >= 1) + subj = subj[0] + + # save + with open(pjoin(output_dir, '{0}_scans.tsv'.format(subj)), 'w') as csvfile: + writer = csv.writer(csvfile, delimiter='\t') + writer.writerow(header) + writer.writerows(rows) + + def tuneup_bids_json_files(json_files): """Given a list of BIDS .json files, e.g. """ if not json_files: diff --git a/tests/test_heuristics.py b/tests/test_heuristics.py index d3c1aed5..64334952 100644 --- a/tests/test_heuristics.py +++ b/tests/test_heuristics.py @@ -4,6 +4,11 @@ from mock import patch from six.moves import StringIO +from glob import glob +from os.path import join as pjoin, dirname +import csv +import re + import pytest from datalad.api import Dataset @@ -64,6 +69,30 @@ def test_dbic_bids_largely_smoke(tmpdir, heuristic, invocation): assert head == ds.repo.get_hexsha() +@pytest.mark.parametrize( + 'invocation', [ + "tests/data", # our new way with automated groupping + ]) +def test_scans_keys_dbic_bids(tmpdir, invocation): + args = "-f heuristics/dbic_bids.py -c dcm2niix -o %s -b " % tmpdir + args += invocation + heudiconv.main(args.split()) + # for now check it exists + scans_keys = glob(pjoin(tmpdir.strpath, '*/*/*/*/*.tsv')) + assert(len(scans_keys) == 1) + with open(scans_keys[0]) as f: + reader = csv.reader(f, delimiter='\t') + for i, row in enumerate(reader): + if i == 0: + assert(row == ['filename', 'acq_time', 'operator']) + assert(len(row) == 3) + if i != 0: + assert(os.path.exists(pjoin(dirname(scans_keys[0]), row[0]))) + assert(re.match( + '^[\d]{4}-[\d]{2}-[\d]{2}T[\d]{2}:[\d]{2}:[\d]{2}$', + row[1])) + + @patch('sys.stdout', new_callable=StringIO) def test_ls(stdout): args = "-f heuristics/dbic_bids.py --command ls tests/data".split(' ') From ec813e98544487449a4e11c64871078e457398f0 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Fri, 23 Jun 2017 20:08:16 -0700 Subject: [PATCH 166/181] Simplify parsing of date --- bin/heudiconv | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 64ed680e..ab2b9ce8 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -951,19 +951,15 @@ def save_scans_key(items, outname_bids_files): dcm_fn = item[-1][0] mw = ds.wrapper_from_data(dcm.read_file(dcm_fn, force=True)) # we need to store filenames and acquisition times - time = mw.dcm_data.AcquisitionTime - hms = time.split('.')[0] - time = [int(hms[sl]) for sl in [slice(2), - slice(2, 4), - slice(4, len(hms))]] + # parse date and time and get it into isoformat date = mw.dcm_data.AcquisitionDate - date = [int(date[sl]) for sl in [slice(4), - slice(4, 6), - slice(6, len(date))]] - dt = datetime(date[0], date[1], date[2], time[0], time[1]) - acq_time = dt.isoformat() + time = mw.dcm_data.AcquisitionTime.split('.')[0] + dt = time + date + acq_time = datetime.strptime(dt, '%H%M%S%Y%m%d').isoformat() + # get filenames f_name = '/'.join(outname_bids_file.split('/')[-2:]) f_name = f_name.replace('json', 'nii.gz') + # store it rows.append((f_name, acq_time, mw.dcm_data.PerformingPhysicianName)) # where should we store it? 
output_dir = dirname(dirname(outname_bids_file)) From c604256095d873894da25f6b309279948bb90e0b Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Fri, 23 Jun 2017 20:09:27 -0700 Subject: [PATCH 167/181] Name variables that make sense --- bin/heudiconv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index ab2b9ce8..e4102628 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -954,8 +954,8 @@ def save_scans_key(items, outname_bids_files): # parse date and time and get it into isoformat date = mw.dcm_data.AcquisitionDate time = mw.dcm_data.AcquisitionTime.split('.')[0] - dt = time + date - acq_time = datetime.strptime(dt, '%H%M%S%Y%m%d').isoformat() + td = time + date + acq_time = datetime.strptime(td, '%H%M%S%Y%m%d').isoformat() # get filenames f_name = '/'.join(outname_bids_file.split('/')[-2:]) f_name = f_name.replace('json', 'nii.gz') From a3beffca181080aa1da9cfb24e946a9cad440703 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Fri, 23 Jun 2017 20:23:51 -0700 Subject: [PATCH 168/181] Add randomly generated column to avoid figuring out date from hash --- bin/heudiconv | 9 +++++++-- tests/test_heuristics.py | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index e4102628..d6dd9a61 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -43,6 +43,8 @@ from os.path import dirname from os.path import exists from os.path import join as pjoin +from random import sample + PY3 = sys.version_info[0] >= 3 import logging @@ -945,7 +947,7 @@ def convert(items, symlink=True, converter=None, def save_scans_key(items, outname_bids_files): import dicom as dcm import dcmstack as ds - header = ['filename', 'acq_time', 'operator'] + header = ['filename', 'acq_time', 'operator', 'randstr'] rows = [] for item, outname_bids_file in zip(items, outname_bids_files): dcm_fn = item[-1][0] @@ -960,7 +962,10 @@ def save_scans_key(items, outname_bids_files): f_name = '/'.join(outname_bids_file.split('/')[-2:]) f_name = f_name.replace('json', 'nii.gz') # store it - rows.append((f_name, acq_time, mw.dcm_data.PerformingPhysicianName)) + randstr = ''.join(map(chr, sample(k=8, population=range(33, 127)))) + rows.append( + (f_name, acq_time, mw.dcm_data.PerformingPhysicianName, randstr) + ) # where should we store it? 
output_dir = dirname(dirname(outname_bids_file)) # get subject info diff --git a/tests/test_heuristics.py b/tests/test_heuristics.py index 64334952..1bd70d51 100644 --- a/tests/test_heuristics.py +++ b/tests/test_heuristics.py @@ -84,8 +84,8 @@ def test_scans_keys_dbic_bids(tmpdir, invocation): reader = csv.reader(f, delimiter='\t') for i, row in enumerate(reader): if i == 0: - assert(row == ['filename', 'acq_time', 'operator']) - assert(len(row) == 3) + assert(row == ['filename', 'acq_time', 'operator', 'randstr']) + assert(len(row) == 4) if i != 0: assert(os.path.exists(pjoin(dirname(scans_keys[0]), row[0]))) assert(re.match( From 12c00aac4d9c07ddc21622c0cdb1e07283be0cb1 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Fri, 23 Jun 2017 20:53:45 -0700 Subject: [PATCH 169/181] Extract function to get row info and add test --- bin/heudiconv | 32 +++++++++++++++++++------------- tests/test_main.py | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index d6dd9a61..162df1b2 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -23,6 +23,8 @@ __version__ = '0.2' import argparse from glob import glob import csv +import dicom as dcm +import dcmstack as ds import inspect import json import os @@ -944,27 +946,31 @@ def convert(items, symlink=True, converter=None, shutil.rmtree(tmpdir) +def get_formatted_scans_key_row(item, outname_bids_files): + dcm_fn = item[-1][0] + mw = ds.wrapper_from_data(dcm.read_file(dcm_fn, stop_before_pixels=True)) + # we need to store filenames and acquisition times + # parse date and time and get it into isoformat + date = mw.dcm_data.AcquisitionDate + time = mw.dcm_data.AcquisitionTime.split('.')[0] + td = time + date + acq_time = datetime.strptime(td, '%H%M%S%Y%m%d').isoformat() + # add random string + randstr = ''.join(map(chr, sample(k=8, population=range(33, 127)))) + row = [acq_time, mw.dcm_data.PerformingPhysicianName, randstr] + return row + + def save_scans_key(items, outname_bids_files): - import dicom as dcm - import dcmstack as ds header = ['filename', 'acq_time', 'operator', 'randstr'] rows = [] + import pdb; pdb.set_trace() for item, outname_bids_file in zip(items, outname_bids_files): - dcm_fn = item[-1][0] - mw = ds.wrapper_from_data(dcm.read_file(dcm_fn, force=True)) - # we need to store filenames and acquisition times - # parse date and time and get it into isoformat - date = mw.dcm_data.AcquisitionDate - time = mw.dcm_data.AcquisitionTime.split('.')[0] - td = time + date - acq_time = datetime.strptime(td, '%H%M%S%Y%m%d').isoformat() # get filenames f_name = '/'.join(outname_bids_file.split('/')[-2:]) f_name = f_name.replace('json', 'nii.gz') - # store it - randstr = ''.join(map(chr, sample(k=8, population=range(33, 127)))) rows.append( - (f_name, acq_time, mw.dcm_data.PerformingPhysicianName, randstr) + [f_name] + get_formatted_scans_key_row(item, outname_bids_file) ) # where should we store it? 
output_dir = dirname(dirname(outname_bids_file)) diff --git a/tests/test_main.py b/tests/test_main.py index 8a65400f..a66072f5 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -97,6 +97,7 @@ def test_prepare_for_datalad(tmpdir): assert not ds.repo.is_under_annex(f) assert not ds.repo.is_under_annex('.gitattributes') + def test_json_dumps_pretty(): pretty = heudiconv.json_dumps_pretty assert pretty({}) == "{}" @@ -104,3 +105,21 @@ def test_json_dumps_pretty(): == '{\n "a": -1,\n "b": "123",\n "c": [1, 2, 3],\n "d": ["1.0", "2.0"]\n}' assert pretty({'a': ["0.3", "-1.9128906358217845e-12", "0.2"]}) \ == '{\n "a": ["0.3", "-1.9128906358217845e-12", "0.2"]\n}' + + +def test_get_formatted_scans_key_row(): + item = [ + ('tests/data/01-fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm', + ('nii.gz', 'dicom'), + ['tests/data/01-fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm']) + ] + outname_bids_file = '/a/path/Halchenko/Yarik/950_bids_test4/sub-phantom1sid1/fmap/sub-phantom1sid1_acq-3mm_phasediff.json' + + row = heudiconv.get_formatted_scans_key_row(item, outname_bids_file) + assert(len(row) == 3) + assert(row[0] == '2016-10-14T09:26:34') + assert(row[1] == '') + randstr1 = row[2] + row = heudiconv.get_formatted_scans_key_row(item, outname_bids_file) + randstr2 = row[2] + assert(randstr1 != randstr2) From 40240471908f80b12d68b5aeaadccbae45fe835f Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Fri, 23 Jun 2017 22:21:27 -0700 Subject: [PATCH 170/181] Remove forgotten pdb --- bin/heudiconv | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/heudiconv b/bin/heudiconv index 162df1b2..28591ea4 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -964,7 +964,6 @@ def get_formatted_scans_key_row(item, outname_bids_files): def save_scans_key(items, outname_bids_files): header = ['filename', 'acq_time', 'operator', 'randstr'] rows = [] - import pdb; pdb.set_trace() for item, outname_bids_file in zip(items, outname_bids_files): # get filenames f_name = '/'.join(outname_bids_file.split('/')[-2:]) From d7b8df412340f80a89da046fc211f8d1fb840328 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sat, 24 Jun 2017 10:02:17 -0700 Subject: [PATCH 171/181] Fix dbic_bids.py compatibility with python 3.x --- bin/heudiconv | 1 - heuristics/dbic_bids.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 28591ea4..d708b00d 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1725,7 +1725,6 @@ def _main(args): # Load heuristic -- better do it asap to make sure it loads correctly # heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) - # TODO: Move into a function! study_sessions = get_study_sessions( dicom_dir_template, files_opt, diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 372e2b6b..046f125b 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -560,8 +560,7 @@ def infotoids(seqinfos, outdir): # e.g. 
{date} session_ = session_.format(**s._asdict()) ses_markers.append(session_) - - ses_markers = filter(bool, ses_markers) # only present ones + ses_markers = list(filter(bool, ses_markers)) # only present ones session = None if ses_markers: # we have a session or possibly more than one even @@ -628,7 +627,6 @@ def sanitize_str(value): def parse_dbic_protocol_name(protocol_name): """Parse protocol name according to our convention with minimal set of fixups """ - # Since Yarik didn't know better place to put it in, but could migrate outside # at some point protocol_name = protocol_name.replace("anat_T1w", "anat-T1w") From f48fc9b9b7d49730747f5754cc844435e618fe46 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sat, 24 Jun 2017 10:11:14 -0700 Subject: [PATCH 172/181] Update dcmstack source to fix missing importsys --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index fb28d827..27055c59 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ pydicom # This is a dcmstack branch with changes for Python 3 -git+git://github.com/ghisvail/dcmstack@master +git+git://github.com/mvdoc/dcmstack@bf/importsys nipype inotify datalad -tinydb \ No newline at end of file +tinydb From 079e63d9166a3a48c535c46faf79990bc4d9561f Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sat, 24 Jun 2017 11:18:44 -0700 Subject: [PATCH 173/181] Clean up requirements and setup.py --- requirements.txt | 8 +++----- setup.py | 9 ++++++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 27055c59..4ce7ccd7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ -pydicom +.[full] # This is a dcmstack branch with changes for Python 3 +# sent PR to main repo, TODO: check if merged +# https://github.com/ghisvail/dcmstack/pull/1 git+git://github.com/mvdoc/dcmstack@bf/importsys -nipype -inotify -datalad -tinydb diff --git a/setup.py b/setup.py index 1acee7da..8a1a4ae4 100755 --- a/setup.py +++ b/setup.py @@ -33,12 +33,19 @@ def findsome(subdir, extensions): requires = { 'core': [ 'nibabel', - # TODO: migrate more from requirements.txt + 'pydicom', ], 'tests': [ 'six', 'nose', ], + 'monitor': [ + 'inotify', + 'tinydb' + ], + 'datalad': [ + 'datalad' + ] } requires['full'] = sum(list(requires.values()), []) From 463d9027d23f67788c1a9120fa9ced5e3ad7937a Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sat, 24 Jun 2017 15:13:25 -0700 Subject: [PATCH 174/181] Reload scans keys if existing, start adding tests --- bin/heudiconv | 76 +++++++++++++++++++++++++++++++++++++++------- tests/test_main.py | 17 +++++++++-- 2 files changed, 80 insertions(+), 13 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index d708b00d..0b310625 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -42,7 +42,7 @@ from datetime import datetime from os.path import isdir from os.path import basename from os.path import dirname -from os.path import exists +from os.path import lexists, exists from os.path import join as pjoin from random import sample @@ -946,7 +946,18 @@ def convert(items, symlink=True, converter=None, shutil.rmtree(tmpdir) -def get_formatted_scans_key_row(item, outname_bids_files): +def get_formatted_scans_key_row(item): + """ + Parameters + ---------- + item + + Returns + ------- + row: list + [ISO acquisition time, performing physician name, random string] + + """ dcm_fn = item[-1][0] mw = ds.wrapper_from_data(dcm.read_file(dcm_fn, stop_before_pixels=True)) # we 
need to store filenames and acquisition times @@ -961,16 +972,60 @@ def get_formatted_scans_key_row(item, outname_bids_files): return row +def add_rows_to_scans_keys_file(fn, newrows): + """ + Add new rows to file fn for scans key filename + + Parameters + ---------- + fn: filename + newrows: extra rows to add + dict fn: [acquisition time, referring physician, random string] + """ + if lexists(fn): + with open(fn, 'r') as csvfile: + reader = csv.reader(csvfile, delimiter='\t') + existing_rows = [row[0] for row in reader] + # skip header + fnames2info = {row[0]: row[1:] for row in existing_rows[1:]} + + newrows_key = newrows.key() + newrows_toadd = list(set(newrows_key) - set(fnames2info.keys())) + for key_toadd in newrows_toadd: + fnames2info[key_toadd] = newrows[key_toadd] + header = [] + # remove + os.unlink(fn) + else: + header = ['filename', 'acq_time', 'operator', 'randstr'] + fnames2info = newrows + + # save + with open(fn, 'a') as csvfile: + writer = csv.writer(csvfile, delimiter='\t') + if header: + writer.writerow(header) + for key in sorted(fnames2info.keys()): + writer.writerow([key] + fnames2info[key]) + + def save_scans_key(items, outname_bids_files): - header = ['filename', 'acq_time', 'operator', 'randstr'] - rows = [] + """ + Parameters + ---------- + items: + outname_bids_files: + + Returns + ------- + + """ + rows = dict() for item, outname_bids_file in zip(items, outname_bids_files): # get filenames f_name = '/'.join(outname_bids_file.split('/')[-2:]) f_name = f_name.replace('json', 'nii.gz') - rows.append( - [f_name] + get_formatted_scans_key_row(item, outname_bids_file) - ) + rows[f_name] = get_formatted_scans_key_row(item) # where should we store it? output_dir = dirname(dirname(outname_bids_file)) # get subject info @@ -980,10 +1035,9 @@ def save_scans_key(items, outname_bids_files): subj = subj[0] # save - with open(pjoin(output_dir, '{0}_scans.tsv'.format(subj)), 'w') as csvfile: - writer = csv.writer(csvfile, delimiter='\t') - writer.writerow(header) - writer.writerows(rows) + add_rows_to_scans_keys_file( + pjoin(output_dir, '{0}_scans.tsv'.format(subj)), + rows) def tuneup_bids_json_files(json_files): diff --git a/tests/test_main.py b/tests/test_main.py index a66072f5..fc64297c 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -115,11 +115,24 @@ def test_get_formatted_scans_key_row(): ] outname_bids_file = '/a/path/Halchenko/Yarik/950_bids_test4/sub-phantom1sid1/fmap/sub-phantom1sid1_acq-3mm_phasediff.json' - row = heudiconv.get_formatted_scans_key_row(item, outname_bids_file) + row = heudiconv.get_formatted_scans_key_row(item) assert(len(row) == 3) assert(row[0] == '2016-10-14T09:26:34') assert(row[1] == '') randstr1 = row[2] - row = heudiconv.get_formatted_scans_key_row(item, outname_bids_file) + row = heudiconv.get_formatted_scans_key_row(item) randstr2 = row[2] assert(randstr1 != randstr2) + + +# TODO: finish this +def test_add_rows_to_scans_keys_file(tmpdir): + fn = opj(tmpdir.strpath, 'file.tsv') + rows = { + 'my_file.nii.gz': ['2016adsfasd', '', 'fasadfasdf'], + 'another_file.nii.gz': ['2018xxxxx', '', 'fasadfasdf'] + } + heudiconv.add_rows_to_scans_keys_file(fn, rows) + + #with open(fn, 'r') as csvfile: + From 382a815715585e882473db3afeb580472704117e Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Sat, 24 Jun 2017 15:23:37 -0700 Subject: [PATCH 175/181] ENH: add metadata for sensitive materials --- bin/heudiconv | 60 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 6 deletions(-) diff --git 
a/bin/heudiconv b/bin/heudiconv
index 8de37ccb..5ee5f01a 100755
--- a/bin/heudiconv
+++ b/bin/heudiconv
@@ -1515,6 +1515,30 @@ def add_participant_record(studydir, subject, age, sex):
                 + '\n')
 
 
+def mark_sensitive(ds, path_glob=None):
+    """
+
+    Parameters
+    ----------
+    ds : Dataset to operate on
+    path_glob : str, optional
+      glob of the paths within dataset to work on
+
+    Returns
+    -------
+    None
+    """
+    sens_kwargs = dict(
+        init=[('distribution-restrictions', 'sensitive')]
+    )
+    if path_glob:
+        paths = glob(opj(ds.path, path_glob))
+        if not paths:
+            return
+        sens_kwargs['path'] = paths
+    ds.metadata(**sens_kwargs)
+
+
 def add_to_datalad(topdir, studydir, msg=None, bids=False):
     """Do all necessary preparations (if were not done before) and save
     """
@@ -1547,14 +1571,15 @@ def add_to_datalad(topdir, studydir, msg=None, bids=False):
         superds = ds
 
     create_file_if_missing(
-            opj(studydir, '.gitattributes'),
-            """\
+        opj(studydir, '.gitattributes'),
+        """\
 * annex.largefiles=(largerthan=100kb)
 *.json annex.largefiles=nothing
 *.txt annex.largefiles=nothing
 *.tsv annex.largefiles=nothing
-*.nii.gz annex.largefiles=(largerthan=0kb)
-*.tgz annex.largefiles=(largerthan=0kb)
+*.nii.gz annex.largefiles=anything
+*.tgz annex.largefiles=anything
+*_scans.tsv annex.largefiles=anything
 """)
     # so for mortals it just looks like a regular directory!
     if not ds.config.get('annex.thin'):
@@ -1567,16 +1592,39 @@ def add_to_datalad(topdir, studydir, msg=None, bids=False):
     ds = Dataset(studydir)
     # Add doesn't have all the options of save such as msg and supers
     ds.add('.gitattributes', to_git=True, save=False)
+    dsh = None
     if os.path.lexists(os.path.join(ds.path, '.heudiconv')):
-        ds.add('.heudiconv', to_git=True, save=False)
+        dsh = Dataset(opj(ds.path, '.heudiconv'))
+        if not dsh.is_installed():
+            # we need to create it first
+            dsh = ds.create(path='.heudiconv',
+                            force=True,
+                            shared_access='all')
+        # Since .heudiconv could contain sensitive information
+        # we place all files under annex and then add
+        create_file_if_missing(
+            opj(dsh.path, '.gitattributes'),
+            """* annex.largefiles=anything
+""")
+        dsh.add('.gitattributes', message="Added gitattributes to place all content under annex")
     ds.add('.', recursive=True, save=False,
            # not in effect! ?
            #annex_add_opts=['--include-dotfiles']
           )
+    # Provide metadata for sensitive information
+    mark_sensitive(ds, 'sourcedata')
+    mark_sensitive(ds, '*_scans.tsv')  # top level
+    mark_sensitive(ds, '*/*_scans.tsv')  # within subj
+    mark_sensitive(ds, '*/anat')  # within subj
+    mark_sensitive(ds, '*/*/anat')  # within subj/ses
+    if dsh:
+        mark_sensitive(dsh)  # entire .heudiconv!
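# A minimal illustration, not part of the patch above (assumes a git-annex-backed
# DataLad dataset): mark_sensitive() only attaches git-annex metadata through
# ds.metadata(), so for a study dataset the calls above are roughly equivalent to
# running, inside the dataset:
#
#   git annex metadata -s distribution-restrictions=sensitive sourcedata
#   git annex metadata -s distribution-restrictions=sensitive sub-*/anat
#
# after which later publishing/export steps can be configured to skip any file
# carrying the distribution-restrictions=sensitive tag.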
+ # import pdb; pdb.set_trace() + dsh.save(message=msg) ds.save(message=msg, recursive=True, super_datasets=True) - assert not ds.repo.dirty + assert not ds.repo.dirty # TODO: they are still appearing as native annex symlinked beasts """ TODOs: From 652d53ca87a3c90137a9379a20c38c90919c8207 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sat, 24 Jun 2017 16:26:18 -0700 Subject: [PATCH 176/181] BF: forgotten nipype --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 8a1a4ae4..e8c52f22 100755 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ def findsome(subdir, extensions): 'core': [ 'nibabel', 'pydicom', + 'nipype' ], 'tests': [ 'six', From 97fbd33397a28f135181e3a88ebffb18395cf731 Mon Sep 17 00:00:00 2001 From: Matteo Visconti dOC Date: Sat, 24 Jun 2017 18:17:51 -0700 Subject: [PATCH 177/181] Add test for add_rows_to_scans_keys_file --- bin/heudiconv | 10 ++++------ tests/test_main.py | 23 ++++++++++++++++++++++- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 0b310625..e8039cba 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -985,26 +985,24 @@ def add_rows_to_scans_keys_file(fn, newrows): if lexists(fn): with open(fn, 'r') as csvfile: reader = csv.reader(csvfile, delimiter='\t') - existing_rows = [row[0] for row in reader] + existing_rows = [row for row in reader] # skip header fnames2info = {row[0]: row[1:] for row in existing_rows[1:]} - newrows_key = newrows.key() + newrows_key = newrows.keys() newrows_toadd = list(set(newrows_key) - set(fnames2info.keys())) for key_toadd in newrows_toadd: fnames2info[key_toadd] = newrows[key_toadd] - header = [] # remove os.unlink(fn) else: - header = ['filename', 'acq_time', 'operator', 'randstr'] fnames2info = newrows + header = ['filename', 'acq_time', 'operator', 'randstr'] # save with open(fn, 'a') as csvfile: writer = csv.writer(csvfile, delimiter='\t') - if header: - writer.writerow(header) + writer.writerow(header) for key in sorted(fnames2info.keys()): writer.writerow([key] + fnames2info[key]) diff --git a/tests/test_main.py b/tests/test_main.py index fc64297c..8d29d11e 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,3 +1,4 @@ +import csv import os import pytest import sys @@ -134,5 +135,25 @@ def test_add_rows_to_scans_keys_file(tmpdir): } heudiconv.add_rows_to_scans_keys_file(fn, rows) - #with open(fn, 'r') as csvfile: + def _check_rows(fn, rows): + with open(fn, 'r') as csvfile: + reader = csv.reader(csvfile, delimiter='\t') + rows_loaded = [] + for row in reader: + rows_loaded.append(row) + for i, row_ in enumerate(rows_loaded): + if i == 0: + assert(row_ == ['filename', 'acq_time', 'operator', 'randstr']) + else: + assert(rows[row_[0]] == row_[1:]) + + _check_rows(fn, rows) + # add a new one + extra_rows = { + 'a_new_file.nii.gz': ['2016adsfasd23', '', 'fasadfasdf'], + 'my_file.nii.gz': ['2016adsfasd', '', 'fasadfasdf'], + 'another_file.nii.gz': ['2018xxxxx', '', 'fasadfasdf'] + } + heudiconv.add_rows_to_scans_keys_file(fn, extra_rows) + _check_rows(fn, extra_rows) From a169e0b2877c4053e66531038dfa70ca27bb9005 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 10 Jul 2017 10:00:00 -0400 Subject: [PATCH 178/181] ENH: add .gitattributes only if was missing --- bin/heudiconv | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 6b4aeb91..1c3984f9 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1708,16 +1708,17 @@ def add_to_datalad(topdir, studydir, msg=None, 
bids=False): shared_access='all') # Since .heudiconv could contain sensitive information # we place all files under annex and then add - create_file_if_missing( + if create_file_if_missing( opj(dsh.path, '.gitattributes'), """* annex.largefiles=anything - """) - dsh.add('.gitattributes', message="Added gitattributes to place all content under annex") + """): + dsh.add('.gitattributes', message="Added gitattributes to place all content under annex") ds.add('.', recursive=True, save=False, # not in effect! ? #annex_add_opts=['--include-dotfiles'] ) + # TODO: filter for only changed files? # Provide metadata for sensitive information mark_sensitive(ds, 'sourcedata') mark_sensitive(ds, '*_scans.tsv') # top level @@ -1738,7 +1739,6 @@ def add_to_datalad(topdir, studydir, msg=None, bids=False): - unlock (thin will be in effect) - save/commit (does modechange 120000 => 100644 - - we should mark dicoms and anatomicals as distribution-restricted - could potentially somehow automate that all: http://git-annex.branchable.com/tips/automatically_adding_metadata/ - possibly even make separate sub-datasets for originaldata, derivatives ? From 25729f451d51c5c94e3863b88f91b8fdc1811732 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 10 Jul 2017 10:29:45 -0400 Subject: [PATCH 179/181] ENH: addressing Matthew's comments -- should be no functional changes --- bin/heudiconv | 73 +++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/bin/heudiconv b/bin/heudiconv index 1c3984f9..24d2e743 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -2,7 +2,9 @@ """Convert DICOM dirs based on heuristic info -This script uses DicomStack and mri_convert to convert DICOM directories. +This script uses the dcmstack package and dcm2niix tool to convert DICOM +directories or tarballs into collections of NIfTI files following pre-defined +heuristic(s). It has multiple modes of operation @@ -18,7 +20,7 @@ It has multiple modes of operation DICOMs are sorted based on study UID, and layed out using specified heuristic """ -__version__ = '0.2' +__version__ = '0.3' import argparse from glob import glob @@ -124,6 +126,8 @@ StudySessionInfo = namedtuple( ) +# TODO: RF to avoid package-level global structure, and be more friendly in +# case of refactoring of heudiconv into a proper Python package/module class TempDirs(object): """A helper to centralize handling and cleanup of dirs""" @@ -296,7 +300,9 @@ def find_files(regex, topdir=curdir, exclude=None, exclude_vcs=True, dirs=False) find_files.__doc__ %= (_VCS_REGEX,) -def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, grouping='studyUID'): +def group_dicoms_into_seqinfos( + files, file_filter=None, dcmfilter=None, grouping='studyUID' +): """Process list of dicoms and return seqinfo and file group `seqinfo` contains per-sequence extract of fields from DICOMs which @@ -304,15 +310,14 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, grouping='stud Parameters ---------- - fl : list of str + files : list of str List of files to consider - flfilter : callable, optional - Applied to each of fl. Should return True if file needs to be kept, - False otherwise. Used to filter fl + file_filter : callable, optional + Applied to each item of filenames. Should return True if file needs to be + kept, False otherwise. 
dcmfilter : callable, optional - If called on dcm_data and returns True, it is used to set - series_id - grouping : str ('studyUID', 'accession_number') or None, optional + If called on dcm_data and returns True, it is used to set series_id + grouping : {'studyUID', 'accession_number', None}, optional what to group by: studyUID or accession_number Returns @@ -328,24 +333,25 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, grouping='stud raise ValueError('I do not know how to group by {0}'.format(grouping)) per_studyUID = grouping == 'studyUID' per_accession_number = grouping == 'accession_number' - lgr.info("Analyzing %d dicoms", len(fl)) + lgr.info("Analyzing %d dicoms", len(files)) import dcmstack as ds import dicom as dcm groups = [[], []] mwgroup = [] - studyUID = None # for sanity check that all DICOMs came from the same - # "study". If not -- what is the use-case? (interrupted acquisition?) - # and how would then we deal with series numbers - # which would differ already - if flfilter: - nfl_before = len(fl) - fl = list(filter(flfilter, fl)) - nfl_after = len(fl) + studyUID = None + # for sanity check that all DICOMs came from the same + # "study". If not -- what is the use-case? (interrupted acquisition?) + # and how would then we deal with series numbers + # which would differ already + if file_filter: + nfl_before = len(files) + files = list(filter(file_filter, files)) + nfl_after = len(files) lgr.info('Filtering out {0} dicoms based on their filename'.format( nfl_before-nfl_after)) - for fidx, filename in enumerate(fl): + for fidx, filename in enumerate(files): # TODO after getting a regression test check if the same behavior # with stop_before_pixels=True mw = ds.wrapper_from_data(dcm.read_file(filename, force=True)) @@ -357,30 +363,29 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, grouping='stud pass try: - studyUID_ = mw.dcm_data.StudyInstanceUID + file_studyUID = mw.dcm_data.StudyInstanceUID except AttributeError: - #import pdb; pdb.set_trace() lgr.info("File %s is missing any StudyInstanceUID" % filename) - studyUID_ = None + file_studyUID = None #continue try: series_id = (int(mw.dcm_data.SeriesNumber), mw.dcm_data.ProtocolName) - studyUID_ = mw.dcm_data.StudyInstanceUID + file_studyUID = mw.dcm_data.StudyInstanceUID if not per_studyUID: # verify that we are working with a single study if studyUID is None: - studyUID = studyUID_ + studyUID = file_studyUID elif not per_accession_number: - assert studyUID == studyUID_ + assert studyUID == file_studyUID except AttributeError as exc: lgr.warning('Ignoring %s since not quite a "normal" DICOM: %s', filename, exc) # not a normal DICOM -> ignore series_id = (-1, 'none') - studyUID_ = None + file_studyUID = None if not series_id[0] < 0: if dcmfilter is not None and dcmfilter(mw.dcm_data): @@ -403,7 +408,7 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, grouping='stud series_id = (-1, mw.dcm_data.ProtocolName) if per_studyUID: - series_id = series_id + (studyUID_,) + series_id = series_id + (file_studyUID,) #print fidx, N, filename @@ -413,13 +418,13 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, grouping='stud #print idx, same, groups[idx][0] if same: # the same series should have the same study uuid - assert mwgroup[idx].dcm_data.get('StudyInstanceUID', None) == studyUID_ + assert mwgroup[idx].dcm_data.get('StudyInstanceUID', None) == file_studyUID ingrp = True if series_id[0] >= 0: series_id = (mwgroup[idx].dcm_data.SeriesNumber, 
mwgroup[idx].dcm_data.ProtocolName) if per_studyUID: - series_id = series_id + (studyUID_,) + series_id = series_id + (file_studyUID,) groups[0].append(series_id) groups[1].append(idx) @@ -445,7 +450,7 @@ def group_dicoms_into_seqinfos(fl, flfilter=None, dcmfilter=None, grouping='stud # nothing to see here, just move on continue dcminfo = mw.dcm_data - files = [fl[i] for i, s in enumerate(groups[0]) if s == series_id] + files = [files[i] for i, s in enumerate(groups[0]) if s == series_id] # turn the series_id into a human-readable string -- string is needed # for JSON storage later on if per_studyUID: @@ -1261,7 +1266,7 @@ def convert_dicoms(sid, if dicoms: seqinfo = group_dicoms_into_seqinfos( dicoms, - flfilter=getattr(heuristic, 'filter_files', None), + file_filter=getattr(heuristic, 'filter_files', None), dcmfilter=getattr(heuristic, 'filter_dicom', None), grouping=None, # no groupping ) @@ -1454,10 +1459,9 @@ def get_study_sessions(dicom_dir_template, files_opt, heuristic, outdir, # sort all DICOMS using heuristic # TODO: this one is not groupping by StudyUID but may be we should! - #import pdb; pdb.set_trace() seqinfo_dict = group_dicoms_into_seqinfos( files_, - flfilter=getattr(heuristic, 'filter_files', None), + file_filter=getattr(heuristic, 'filter_files', None), dcmfilter=getattr(heuristic, 'filter_dicom', None), grouping=grouping) @@ -1727,7 +1731,6 @@ def add_to_datalad(topdir, studydir, msg=None, bids=False): mark_sensitive(ds, '*/*/anat') # within subj/ses if dsh: mark_sensitive(dsh) # entire .heudiconv! - # import pdb; pdb.set_trace() dsh.save(message=msg) ds.save(message=msg, recursive=True, super_datasets=True) From 4cdb33d83f5d4a4b2b505038fe228bfbf2b95632 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 10 Jul 2017 12:53:50 -0400 Subject: [PATCH 180/181] BF: we need nipype for core and pytest not nose for testing --- .travis.yml | 1 - dev-requirements.txt | 3 +-- setup.py | 3 ++- tox.ini | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index b11f94c8..6b5fdf21 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,7 +35,6 @@ install: - git config --global user.name "Travis Almighty" script: - # - nosetests -s -v --with-doctest --doctest-tests --with-cov --cover-package . 
--logging-level=INFO tests
   - coverage run `which py.test` -s -v tests heuristics
 
 after_success:
diff --git a/dev-requirements.txt b/dev-requirements.txt
index 78e5af36..26b77f68 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,3 +1,2 @@
 -r requirements.txt
-six
-nose
+pytest
diff --git a/setup.py b/setup.py
index 8a1a4ae4..dca6f021 100755
--- a/setup.py
+++ b/setup.py
@@ -34,10 +34,11 @@ def findsome(subdir, extensions):
     'core': [
         'nibabel',
         'pydicom',
+        'nipype',
     ],
     'tests': [
         'six',
-        'nose',
+        'pytest',
     ],
     'monitor': [
         'inotify',
diff --git a/tox.ini b/tox.ini
index d483a432..5f8f5fcd 100644
--- a/tox.ini
+++ b/tox.ini
@@ -2,5 +2,5 @@
 envlist = py27,py33,py34,py35
 
 [testenv]
-commands = nosetests -s -v {posargs} tests
+commands = python -m pytest -s -v {posargs} tests
 deps = -r{toxinidir}/dev-requirements.txt

From b55b892ba3c7c645baa458d3a7be75056800df14 Mon Sep 17 00:00:00 2001
From: Yaroslav Halchenko
Date: Mon, 10 Jul 2017 18:43:46 -0400
Subject: [PATCH 181/181] BF: fixup for recent RF fl -> files which collided with use of files later in the code

---
 bin/heudiconv | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/bin/heudiconv b/bin/heudiconv
index 24d2e743..b8ad96d2 100755
--- a/bin/heudiconv
+++ b/bin/heudiconv
@@ -450,7 +450,7 @@ def group_dicoms_into_seqinfos(
             # nothing to see here, just move on
             continue
         dcminfo = mw.dcm_data
-        files = [files[i] for i, s in enumerate(groups[0]) if s == series_id]
+        series_files = [files[i] for i, s in enumerate(groups[0]) if s == series_id]
         # turn the series_id into a human-readable string -- string is needed
         # for JSON storage later on
         if per_studyUID:
@@ -460,7 +460,7 @@ def group_dicoms_into_seqinfos(
 
             series_id = '-'.join(map(str, series_id))
 
-        size = list(mw.image_shape) + [len(files)]
+        size = list(mw.image_shape) + [len(series_files)]
         total += size[-1]
         if len(size) < 4:
             size.append(1)
@@ -482,9 +482,9 @@ def group_dicoms_into_seqinfos(
             or 'MOCO' in image_type
         info = SeqInfo(
             total,
-            os.path.split(files[0])[1],
+            os.path.split(series_files[0])[1],
             series_id,
-            os.path.basename(os.path.dirname(files[0])),
+            os.path.basename(os.path.dirname(series_files[0])),
             '-', '-',
             size[0], size[1], size[2], size[3],
             TR, TE,
@@ -527,13 +527,13 @@ def group_dicoms_into_seqinfos(
         if per_studyUID:
             if studyUID not in seqinfo:
                 seqinfo[studyUID] = ordereddict()
-            seqinfo[studyUID][info] = files
+            seqinfo[studyUID][info] = series_files
         elif per_accession_number:
             if accession_number not in seqinfo:
                 seqinfo[accession_number] = ordereddict()
-            seqinfo[accession_number][info] = files
+            seqinfo[accession_number][info] = series_files
         else:
-            seqinfo[info] = files
+            seqinfo[info] = series_files
 
     if per_studyUID:
         lgr.info("Generated sequence info for %d studies with %d entries total",